{
 "metadata": {
  "name": "",
  "signature": "sha256:143998b5b555ace035ec2111c21da5fe8c8c7f711e70a2448be1d83d23d925d6"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pylab inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Populating the interactive namespace from numpy and matplotlib\n"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Exploring the User Dataset"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# replace this PATH with the correct path to the MovieLens dataset on your computer\n",
      "PATH = \"\"\n",
      "user_data = sc.textFile(\"%s/ml-100k/u.user\" % PATH)\n",
      "user_data.first()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "u'1|24|M|technician|85711'"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "user_fields = user_data.map(lambda line: line.split(\"|\"))\n",
      "num_users = user_fields.map(lambda fields: fields[0]).count()\n",
      "num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()\n",
      "num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()\n",
      "num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()\n",
      "print \"Users: %d, genders: %d, occupations: %d, ZIP codes: %d\" % (num_users, num_genders, num_occupations, num_zipcodes)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Users: 943, genders: 2, occupations: 21, ZIP codes: 795\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ages = user_fields.map(lambda x: int(x[1])).collect()\n",
      "hist(ages, bins=20, color='lightblue', normed=True)\n",
      "fig = matplotlib.pyplot.gcf()\n",
      "fig.set_size_inches(16, 10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "png": "iVBORw0KGgoAAAANSUhEUgAAA6kAAAJPCAYAAACetZKYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHHRJREFUeJzt3XGMpPdd3/HP3G7ckITmdgHZnO+qi2zTsxFtnEbmWlKy\ntBA5FrX/uErGUkhrJGwpnBKgAtf8k/NfFPoHkWvVnFoHWQFi1ByJLsLBISUrBUVcMNgXJ/aCz47b\nO29ioDeb1sZVfZvtH8/ju9nx3c3s3dnP99l9vaTRzTzzzO5X/mU389555pkEAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAgE3sxiRLSZ5Octc59rm3vf9okutHtj+X5KtJHkvylddvRAAAALaCmSTHkuxO\n8qYkjye5dmyfm5I83F7/4SR/OnLfN5LMv74jAgAAsFlsm3D/DWki9bkkryR5KMktY/vcnOTB9vqR\nJNuTXD5y/+CipwQAAGBLmBSpVyY5PnL7RLtt2n3WknwhyaNJfvbCxwQAAGArmJ1w/9qUX+dcr5a+\nJ8lyku9L8kdp3tv6pSm/JgAAAFvMpEh9Psmukdu70rxSer59drbbkiZQk+Rvknw6zeHD6yL1qquu\nWnvmmWc2MDIAAAA98kySq6fdedLhvo8muSbNiZMuS3JrksNj+xxO8sH2+t4kK0leSPKWJN/dbn9r\nkvcleeI10z7zTNbW1lx6evnoRz/a+Qwu1m8rXqxdvy/Wr78Xa9fvi/Xr98X69feS5KppAzWZ/Erq\nqST7kzyS5ky/DyR5Ksmd7f0H05zZ96Y0J1h6Kcnt7X1XJPn9ke/zO0k+v5HhAAAA2FomRWqSfK69\njDo4dnv/WR73bJJ3XshQAAAAbE2TDveF81pYWOh6BC6C9esva9dv1q+/rF2/Wb9+s35bR4XPMF1r\nj1MGAABgkxkMBskG2tMrqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABl\niFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCp\nAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEA\nAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQ\nhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUMZs1wMA9MHc\n/HxWhsOux1hn+9xchidPdj0GAMAlNeh6gCRra2trXc8AcF6DwSCHlpa7HmOdfXt2xO9PAKC6wWCQ\nbKA9He4LAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAA\nQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAy\nRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhU\nAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAA\nAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAo\nQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZI\nBQAAoIxpIvXGJEtJnk5y1zn2ube9/2iS68fum0nyWJLPXuCMAAAAbBGTInUmyX1pQvW6JLcluXZs\nn5uSXJ3kmiR3JLl/7P6PJHkyydrFDgsAAMD
mNilSb0hyLMlzSV5J8lCSW8b2uTnJg+31I0m2J7m8\nvb0zTcT+1ySDix8XAACAzWxSpF6Z5PjI7RPttmn3+Y0kv5TkOxcxIwAAAFvE7IT7pz1Ed/xV0kGS\nn0zy12nej7pwvgcfOHDg9PWFhYUsLJx3dwAAAIpaXFzM4uLiBT9+0iG4e5McSPOe1CS5O82ror82\nss9vJllMcyhw0pxkaSHJh5P8dJJTSd6c5O8nOZTkg2PfY21tzdtVgdoGg0EOLS13PcY6+/bsiN+f\nAEB1g8Eg2cDbPycd7vtomhMi7U5yWZJbkxwe2+dwzoTn3iQrSb6V5FeS7EryjiQ/leSP89pABQAA\ngNMmHe57Ksn+JI+kOdPvA0meSnJne//BJA+nOTnSsSQvJbn9HF/Ln/sBAAA4rwpn3HW4L1Cew30B\nAC7MpT7cFwAAAN4wIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEK\nAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAA\ngDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABl\niFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCp\nAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEA\nAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQ\nhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyR\nCgAAQBmzXQ8AcDZz8/NZGQ67HgMAgDeYSAVKWhkOc2hpuesxTtu3Z0fXIwAAbAkO9wUAAKAMkQoA\nAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACA\nMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWI\nVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkA\nAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMqaJ1BuTLCV5Osld59jn3vb+o0mub7e9\nOcmRJI8neTLJr17UpAAAAGx6kyJ1Jsl9aUL1uiS3Jbl2bJ+bklyd5JokdyS5v93+f5P8WJJ3JvlH\n7fX3XJKpAQAA2JQmReoNSY4leS7JK0keSnLL2D43J3mwvX4kyfYkl7e3/67997I0wXvy4sYFAABg\nM5sUqVcmOT5y+0S7bdI+O9vrM2kO930hyRfTHPYLAAAAZzUpUtem/DqDczxuNc3hvjuT/GiShakn\nAwAAYMuZnXD/80l2jdzeleaV0vPts7PdNurbSf4gybuTLI5/kwMHDpy+vrCwkIWFhQljAQAAUNHi\n4mIWFxcv+PHjr4COm03yl0n+ZZLlJF9Jc/Kkp0b2uSnJ/vbfvUk+1v77vUlOJVlJ8l1JHklyT5L/\nPvY91tbWpn3BFtgqBoNBDi0tdz3Gafv27Cg1T9LM5PcnAFDdYDBIJrfnaZNeST2VJkAfSfP+0gfS\nBOqd7f0HkzycJlCPJXkpye3tfd+f5oRK29rLJ/LaQAUAAIDTJkVqknyuvYw6OHZ7/1ke90SSd13I\nUAAAAGxNk06cBAAAAG+YaV5JBTa5ufn5rAyHXY8BAAAiFUhWhsOSJwUCAGDrcbgvAAAAZYhUAAAA\nyhCpAAA
AlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQh\nUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyZrseALaaufn5rAyHXY8BAAAliVR4g60Mhzm0\ntNz1GOvs27Oj6xEAACCJw30BAAAoRKQCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAA\nyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQh\nUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QC\nAABQxmzXAwBwYWZmZjIYDLoeY53tc3MZnjzZ9RgAQI+JVICeWl1dzaGl5a7HWGffnh1djwAA9JzD\nfQEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QC\nAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAA\noAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZ\nIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQq\nAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAA\nAMoQqQAAAJQhUgEAAChjtusBANg8ZmZmMhgMuh5jne1zcxmePNn1GOvMzc9nZTjseox1Kv53AmBr\nEqkAXDKrq6s5tLTc9Rjr7Nuzo+sRXmNlOPTfCQDOweG+AAAAlCFSAQAAKEOkAgAAUMa0kXpjkqUk\nTye56xz73NvefzTJ9e22XUm+mOTrSb6W5MMXPCkAAACb3jSROpPkvjShel2S25JcO7bPTUmuTnJN\nkjuS3N9ufyXJLyT5wSR7k/zcWR4LAAAASaaL1BuSHEvyXJrofCjJLWP73Jzkwfb6kSTbk1ye5FtJ\nHm+3v5jkqSROHwgAAMBZTROpVyY5PnL7RLtt0j47x/bZneYw4CMbGxEAAICtYppIXZvya41/evvo\n496W5FNJPpLmFVUAAAB4jdkp9nk+zQmQXrUrzSul59tnZ7stSd6U5FCS307ymbN9gwMHDpy+vrCw\nkIWFhSnGAgAAoJrFxcUsLi5e8OOnidRH05wQaXeS5SS3pjl50qjDSfaneb/q3iQrSV5I8+rqA0me\nTPKxc32D0UgFAACgv8ZfeLznnns29PhpIvVUmgB9JM2Zfh9IcwKkO9v7DyZ5OM0Zfo8leSnJ7e19\nP5LkA0m+muSxdtvdSf5wQ1MCAACwJUwTqUnyufYy6uDY7f1nedyfZPrPYgUAAGCLE5AAAACUIVIB\nAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAA\nUMZs1wMAwOtpZmYmg8Gg6zHYBObm57MyHHY9xjrb5+YyPHmy6zEALimRCsCmtrq6mkNLy12Psc6+\nPTu6HoELsDIc+t8SwBvA4b4AAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAA\nZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMqY\n7XoAAIBxc/PzWRkOux4DgA6IVACgnJXhMIeWlrseY519e3Z0PQLAluBwXwAAAMoQqQAAAJQhUgEA\nAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABlzHY9AADQvZmZm
QwGg67HAACRCgAkq6ur\nObS03PUYp+3bs6PrEQDoiMN9AQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAA\nAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACU\nIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOk\nAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUA\nAKAMkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABA\nGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAAgDJE\nKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABlTBOp\nNyZZSvJ0krvOsc+97f1Hk1w/sv3jSV5I8sRFzAgAAMAWMSlSZ5LclyZUr0tyW5Jrx/a5KcnVSa5J\nckeS+0fu+632sQAAADDRpEi9IcmxJM8leSXJQ0luGdvn5iQPttePJNme5Ir29peSDC/FoAAAAGx+\nkyL1yiTHR26faLdtdB8AAACYaHbC/WtTfp3BBT4uSXLgwIHT1xcWFrKwsLCRhwMAAFDE4uJiFhcX\nL/jxkyL1+SS7Rm7vSvNK6fn22dlum9popAIAANBf4y883nPPPRt6/KTDfR9Nc0Kk3UkuS3JrksNj\n+xxO8sH2+t4kK2nO6AsAAAAbMilSTyXZn+SRJE8m+b0kTyW5s70kycNJnk1zgqWDST408vhPJvly\nkh9I877V2y/V4AAAAGw+kw73TZLPtZdRB8du7z/HY2/b8EQAAABsWZNeSQUAAIA3jEgFAACgDJEK\nAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAAUIZIBQAAoAyRCgAAQBkiFQAA\ngDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAMkQoAAEAZIhUAAIAyRCoAAABl\niFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCp\nAAAAlDHb9QAAAFyYmZmZDAaDrsdYZ/vcXIYnT3Y9BtBjIhUAoKdWV1dzaGm56zHW2bdnR9cjAD3n\ncF8AAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCp\nAAAAlCFSAQAAKEOkAgAAUIZIBQAAoIzZrgcAAIDX09z8fFaGw67HWGf73FyGJ092PQaUJFIBANjU\nVobDHFpa7nqMdfbt2dH1CFCWw30BAAAoQ6QCAABQhkgFAACgDJEKAABAGSIVAACAMpzdFwCAS2Zm\nZiaDwaDrMYAeE6kAAFwyq6urPu4FuCgO9wUAAKAMkQoAAEAZDvdlU5ubn8/KcNj1GAAAwJREKpva\nynDofTEAANAjDvcFAACgDJEKAABAGSIVAACAMkQqAAAAZYhUAAAAyhCpAAAAlCFSAQAAKEOkAgAA\nUIZIBQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWIVAAAAMoQqQAAAJQhUgEAAChDpAIAAFCGSAUAAKAM\nkQoAAEAZIhUAAIAyRCoAAABliFQAAADKEKkAAACUIVIBAAAoQ6QCAABQhkgFAACgDJEKAABAGbNd\nD8DmMTc/n5XhsOsxAADKm5mZyWAw6HqM07bPzWV48mTXY0ASkcoltDIc5tDSctdjrLNvz46uRwAA\neI3V1dVSz5s8Z6ISh/sCAABQhkgFA
ACgDJEKAABAGSIVAACAMkQqAAAAZTi7LwAAbHHVPhInSWZm\nZ7N66lTXY6zjo3reGCIVAAC2uGofiZM0H4tTcSZefw73BQAAoAyRCgAAQBkiFQAAgDJEKgAAAGWI\nVAAAAMpwdt8eevnll/Oud787L774UtejAAAAXFIitYdefvnlnDh+Ir/+6c93PcppTx/9i/zGv/tQ\n12MAAAA9J1J7atvMtly+8x90PcZpf/vNWp9hBQAA9NM070m9MclSkqeT3HWOfe5t7z+a5PoNPhYA\nAACSTI7UmST3pYnN65LcluTasX1uSnJ1kmuS3JHk/g08lp772pEvdz0CF8H69Ze1g2742es369dv\n1m/rmBSpNyQ5luS5JK8keSjJLWP73Jzkwfb6kSTbk1wx5WPpua9/xS+LPrN+/WXtoBt+9vrN+vWb\n9ds6JkXqlUmOj9w+0W6bZp8dUzwWAAAATpt04qS1Kb/O4GIHYXrbtm3L3734Yv7jh/5t16Pk+LPH\n8j+f+lr+98rJrkcBAAA2gUlxuTfJgTTvK02Su5N8J8mvjezzm0kW0xzOmzQnSnpvkndM8dikOST4\nqo0ODgAAQC88k+Y8RpfEbPsFdye5LMnjOfuJkx5ur+9N8qcbeCwAAABsyPuT/GWaVzzvbrfd2V5e\ndV97/9Ek75rwWAAAAAAAAADO58Y072F9OsldHc/C+X08yQtJnhjZNp/kj5L8VZLPp/n4IWraleSL\nSb6e5GtJPtxut4b1vTnNx3s9nuTJJL/abrd2/TKT5LEkn21vW7/+eC7JV9Os31fabdavH7Yn+VSS\np9L8/vzhWLu++IdpfuZevXw7zXMX69cPd6d5zvlEkt9N8vfSo7WbSXMY8O4kb4r3rFb3z5Ncn/WR\n+utJfrm9fleS//BGD8XUrkjyzvb629Ichn9trGFfvKX9dzbN+/7fE2vXN7+Y5HeSHG5vW7/++Eaa\nJ1ejrF8/PJjkZ9rrs0neHmvXR9uSfDPNH9ytX327kzybJkyT5PeS/Jv0aO3+aZI/HLn979sLde3O\n+khdSnJ5e/2K9jb98JkkPx5r2DdvSfJnSX4w1q5Pdib5QpIfy5lXUq1ff3wjyfeMbbN+9b09zRPl\ncdauf96X5EvtdetX33yaF0Pm0vxx6LNJfiI9Wrt/neS/jNz+QJL/1NEsTGd31kfqcOT6YOw2de1O\n8j+SfHesYV9sS3O0yf9J85fIxNr1yX9LcyTKe3MmUq1ffzyb5nDDR5P8bLvN+tX3zjRvlfitJH+R\n5jnnW2Pt+ujjST7UXrd+/XBHmucsf53kE+22Da3dttdnrqmsdfi9ufTWYk374G1JDiX5SJpfHqOs\nYV3fSfOEa2eSH03zitwoa1fXT6b5P+nHcu7PJrd+tf1Imj8yvD/Jz6V5+8so61fTbJpPnPjP7b8v\n5bVH7Fm7+i5L8q/S/LFvnPWr6aokP5/mRZEdaZ57fmBsn4lr12WkPp/m2PJX7UpyoqNZuDAvpHm5\nPkm+P80TMep6U5pA/USaw30Ta9g3307yB0n+SaxdX/yzJDenOWT0k0n+RZqfQevXH99s//2bJJ9O\nckOsXx+caC9/1t7+VJpY/VasXZ+8P8mfp/n5S/zs9cG7k3w5yf9KcirJ76d5m+eGfva6jNRHk1yT\nprIvS3JrzpxQgn44nOaN0Gn//cx59qVbgyQPpDm74cdGtlvD+r43Z86A911p3tfxWKxdX/xKmj/C\nviPJTyX54yQ/HevXF29J89aIpDlU9H1p3vZi/er7VpLjSX6gvf3jac42+tlYuz65Lc0f+F7lZ6++\npSR70zxnGaT52XsyPfvZe3+aN9YeS3OqYur6ZJLlJP8vzS/929O8MfoL6cGppMl70hwy+njOnM79\nxljDPvihNO+nejzNx2D8Urvd2vXPe3Pmj7HWrx/ekeZn7/E0H9/16nMV69cP/zjNK6lH07ya8/ZY\nu
z55a5K/zZk/FCXWry9+OWc+gubBNEfzWTsAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nALam/w/9oAlN7ovWnQAAAABJRU5ErkJggg==\n",
       "text": [
        "<matplotlib.figure.Figure at 0x108dbbcd0>"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()\n",
      "x_axis1 = np.array([c[0] for c in count_by_occupation])\n",
      "y_axis1 = np.array([c[1] for c in count_by_occupation])\n",
      "x_axis = x_axis1[np.argsort(y_axis1)]\n",
      "y_axis = y_axis1[np.argsort(y_axis1)]\n",
      "\n",
      "pos = np.arange(len(x_axis))\n",
      "width = 1.0\n",
      "\n",
      "ax = plt.axes()\n",
      "ax.set_xticks(pos + (width / 2))\n",
      "ax.set_xticklabels(x_axis)\n",
      "\n",
      "plt.bar(pos, y_axis, width, color='lightblue')\n",
      "plt.xticks(rotation=30)\n",
      "fig = matplotlib.pyplot.gcf()\n",
      "fig.set_size_inches(16, 10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "png": "iVBORw0KGgoAAAANSUhEUgAAA6AAAAJxCAYAAABc0N+PAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xm4ZVV9J/zvrSooBgWqBIFikNkSIYADEki0FEVIBAei\n4Dy1ihinxDgTyzE4IcYxKg4RRVBEcULBEcUWRwRtjHZrvyEYMhVvuvt9ujsq7x+/s93nHm5JUffc\ndc4pPp/nuU/dc+65ddbdZ++113ettddOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALZoeyX5cpIf\nJbkmybMGz69OcmmSv0vyhSQ7Df3Oi5L8NMm1SY5rVlIAAABm2m5JDh98f7skP0lylySvS/L8wfMv\nSHLm4PuDk/wgyVZJ9knysyTLGpUVAACALcgnktw/Nbq56+C53QaPkxr9fMHQ6y9JclSz0gEAADC1\nbs3o5D5JjkjyrVT4vGHw/A3pw+iaJNcN/c51SfZYXBEBAADYEmxqAL1dkguTPDvJ/xj52U2Dr435\nXT8DAADgNmLFJrxmq1T4/GBqCm5So567JfnHJLsn+afB8/+QWrios+fguXkOO+ywm6666qrNLDIA\nAABT7qr06wn91i2NgM4lOSfJj5OcPfT8xUkeP/j+8emD6cVJTk2ydZJ9kxyY5MqbleSqq3LTTTeN\n9etlL3vZ2P/PpfqalbLOSjlnqayzUs5ZKuuslHOWyjor5Zylss5KOWeprLNSzlkq66yUc5bKOivl\nnKWyzko5Z6msS1XOJIctFDBvaQT0mCSPSfLDJN8fPPei1Kq3FyR5cpJfJHnE4Gc/Hjz/4yS/SnJ6\nTMEFAAAgtxxAv56Nj5LefyPPv2bwBQAAAL+1fELvu379+vVj/0/32Wefsf+fS2VWyjor5Uxmp6yz\nUs5kdso6K+VMZqess1LOZHbKOivlTGanrLNSzmR2yjor5Uxmp6yzUs5kdso6K+VMZqesS1HOl7/8\n5Uny8tHn58b+TpvmpsG8YAAAALYwc3NzyQJ589bcBxQAAAA2mwAKAABAEwIoAAAATQigAAAANCGA\nAgAA0MQt3QcUAACAKbBq9ercuGHDpIuxKG7DAgAAMAPm5uZy4bXXT7oYm+TktWsSt2EBAABgUgRQ\nAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEAB\nAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUA\nAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAA\ngCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAA\nmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABo\nQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJ\nARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYE\nUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBA\nAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAF\nAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQA\nAIAmBFAAAACaEEABAABoQgAFAACgCQEUAAC
AJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAA\nAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAA\naEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACg\nCQEUAACAJgRQAAAAmtiUAPreJDckuXroufVJrkvy/cHXCUM/e1GSnya5NslxYyklAAAAM29TAuj7\nkhw/8txNSc5KcsTg63OD5w9Ocsrg3+OTvH0T3wMAAIAt3KaEw8uTbFjg+bkFnntwkvOS/EeSXyT5\nWZIjN7dwAAAAbDkWMzr5zCRXJTknyU6D59akpuZ2rkuyxyLeAwAAgC3Eis38vXckecXg+1cmeWOS\nJ2/ktTct9OT69et/+/26deuybt26zSwKAAAAk3TNt67Ij6684hZft9A02oXsk+RTSQ69hZ+9cPDc\nmYN/L0nysiTfGvmdm266acFcCgAAwALm5uZy4bXXT7oYm+TktWuSBfLm5k7B3X3o+4emXyH34iSn\nJtk6yb5JDkxy5Wa+BwAAAFuQTZmCe16S+yTZOcnfp0Y01yU5PDW99udJnjZ47Y+TXDD491dJTs9G\npuACAABw27KpU3DHzRRcAACAW+G2PAUXAAAAbhUBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQA\nAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAA\nAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAA\naEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACg\nCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAm\nBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQ\nQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIA\nBQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEU\nAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAA\nAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEA\nAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAA\noAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACA\nJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACa\nEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoIkVky4AAADAJK1avTo3btgw6WLc\nJgigAADAbdqNGzbkwmuvn3QxbtHJa9dMugiLZgouAAAATQigAAAANCGAAgAA0IQACgAAQBMCKAAA\nAE0IoAAAADQhgAIAANDEpgTQ9ya5IcnVQ8+tTnJpkr9L8oUkOw397EVJfprk2iTHjaeYAAAAzLpN\nCaDvS3L8yHMvTAXQg5J8cfA4SQ5Ocsrg3+OTvH0T3wMAAIAt3KaEw8uTbBh57qQkHxh8/4EkDxl8\n/+Ak5yX
5jyS/SPKzJEcuupQAAADMvM0dndw1NS03g393HXy/Jsl1Q6+7Lskem/keAAAAbEHGMT32\npsHX7/o5AAAAt3ErNvP3bkiyW5J/TLJ7kn8aPP8PSfYaet2eg+duZv369b/9ft26dVm3bt1mFgUA\nAIBJuuZbV+RHV15xi6+b28T/b58kn0py6ODx65L8a5LXphYg2mnw78FJPpy67nOPJJclOSA3HwW9\n6aabDIwCAACTNzc3lwuvvX7SxbhFJ69dMxPlTKqsWSBvbsoI6HlJ7pNk5yR/n+Qvk5yZ5IIkT04t\nNvSIwWt/PHj+x0l+leT0mIILAABANi2APnIjz99/I8+/ZvAFAAAAv+UenQAAADQhgAIAANCEAAoA\nAEATAigAAABNCKAAAAA0IYACAADQhAAKAABAEwIoAAAATQigAAAANCGAAgAA0MSKSRcAAADY8qxa\nvTo3btgw6WIwZQRQAABg7G7csCEXXnv9pIuxSU5eu2bSRbjNMAUXAACAJgRQAAAAmhBAAQAAaEIA\nBQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEU\nAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAA\nAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEA\nAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAA\noAkBFAAAgCYEUAAAAJpYMekCAAAAm27V6tW5ccOGSRcDNosACgAAM+TGDRty4bXXT7oYt+jktWsm\nXQSmkCm4AAAANCGAAgAA0IQACgAAQBMCKAAAAE0IoAAAADQhgAIAANCEAAoAAEATAigAAABNCKAA\nAAA0IYACAADQhAAKAABAEwIoAAAATQigAAAANCGAAgAA0IQACgAAQBMCKAAAAE0IoAAAADQhgAIA\nANCEAAoAAEATAigAAABNCKAAAAA0IYACAADQhAAKAABAEwIoAAAATQigAAAANCGAAgAA0IQACgAA\nQBMCKAAAAE0IoAAAADQhgAIAANCEAAoAAEATAigAAABNrJh0AQAAYNJWrV6dGzdsmHQxYIsngAIA\ncJt344YNufDa6yddjE1y8to1ky4CbDYBFACAJWNkERgmgAIAsGRmZWTRqCK0YREiAAAAmhBAAQAA\naEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACg\nCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAm\nBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQ\nQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIA\nBQAAoIkVi/z9XyT59yS/TvIfSY5MsjrJ+UnuNPj5I5LcuMj3AQAAYMYtdgT0piTrkhyRCp9J8sIk\nlyY5KMkXB48BABiTVatXZ25ubia+AIYtdgQ0SUZrlpOS3Gfw/QeSfCVCKADA2Ny4YUMuvPb6SRdj\nk5y8ds2kiwBMkXGMgF6W5DtJnjJ4btckNwy+v2HwGAAAgNu4xY6AHpPkl0l2SU27vXbk5zcNvm5m\n/fr1v/1+3bp1Wbdu3SKLAgAAwCRc860r8qMrr7jF1y02gP5y8O8/J7kodR3oDUl2S/KPSXZP8k8L\n/eJwAAUAAGB2HXKvo3PIvY7+7eML3nbWgq9bzBTc7ZLcfvD99kmOS3J1k
ouTPH7w/OOTfGIR7wEA\nAMAWYjEjoLumRj27/+dDSb6Quh70giRPTn8bFgAAAG7jFhNAf57k8AWe/7ck91/E/wsAAMAWaLGr\n4AIAAMAmEUABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJpY\nMekCAABMi1WrV+fGDRsmXQyALZYACgAwcOOGDbnw2usnXYxbdPLaNZMuAsBmMQUXAACAJgRQAAAA\nmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABo\nQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJ\nARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIABQAAoAkBFAAAgCYE\nUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaWDHpAgAAW7ZVq1fnxg0bJl0MAKaAAAoALKkb\nN2zIhddeP+libJKT166ZdBEAtmim4AIAANCEEVAAmEGmtQIwiwRQAJhBprUCMItMwQUAAKAJARQA\nAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJtyGBQCGuL8mACwdARQAhszK/TXdWxOAWWQKLgAA\nAE0YAQVgyZnWCgAkAigADczKtNbE1FYAWEqm4AIAANCEEVCAGWZqKwAwSwRQgBk2K1NbTWsFABJT\ncAEAAGhEAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEUAACAJgRQAAAAmhBAAQAAaEIA\nBQAAoAkBFAAAgCYEUAAAAJoQQAEAAGhCAAUAAKAJARQAAIAmBFAAAACaEEABAABoQgAFAACgCQEU\nAACAJgRQAAAAmlgx6QIATJtVq1fnxg0bJl0MAIAtjgAKMOLGDRty4bXXT7oYm+TktWsmXQQAgE1m\nCi4AAABNCKAAAAA0IYACAADQhAAKAABAExYhApqxuiwAwG2bAAo0Myury1pZFgBgaZiCCwAAQBMC\nKAAAAE0IoAAAADThGlCYcRb2AQBgVgigMONmZWGfxOI+AAC3dabgAgAA0IQACgAAQBMCKAAAAE0I\noAAAADRhESKamqUVW5evWJFf/+pXky4GAABsMQRQmpq1FVtnoaxWlgUAYFaYggsAAEATAigAAABN\nCKAAAAA0IYACAADQhAAKAABAEwIoAAAATbgNyxZilu6vCQAA3DYJoFuIWbm/pntWAgDAbZcpuAAA\nADQhgAIAANCEAAoAAEATAigAAABNTGwRorm5uUm99SZbvmJFfv2rX026GAAAAFuEiQXQWVmxdRbK\nmVhdFgAAmH6m4AIAANCEAAoAAEATAigAAABNCKAAAAA0IYACAADQhAAKAABAEwIoAAAATQigAAAA\nNCGAAgAA0IQACgAAQBMCKAAAAE0IoAAAADQhgAIAANDEUgXQ45Ncm+SnSV6wRO8BAADADFmKALo8\nyVtTIfTgJI9McpcleJ95rvnWFUv9FmMzK2WdlXIms1PWWSlnMjtlnZVyJrNT1lkpZzI7ZZ2Vciaz\nU9ZZKWcyO2WdlXIms1PWWSlnMjtlnZVyJrNT1tblXIoAemSSnyX5RZL/SPKRJA9egveZ50dXzsYH\nnMxOWWelnMnslHVWypnMTllnpZzJ7JR1VsqZzE5ZZ6WcyeyUdVbKmcxOWWelnMnslHVWypnMTlln\npZzJ7JS1dTmXIoDukeTvhx5fN3gOAACA27ClCKA3LcH/CQAAwIybW4L/86gk61PXgCbJi5L8Jslr\nh17zgySHLcF7AwAAMHlXJTm8xRutSPJfk+yTZOtU2FzyRYgAAAC4bTohyU9SixG9aMJlAQAAAAAA\nAGDWLJ90AWbEUlz33NI0lv/A2P+4u
RWDf6dxnwXGy3HOrJimfXWaysISmcuW2UhePvL9QZMqyJQb\n3U6zaBrLvS7JlUnuPOFyLNY0btvNte2E33/F0Pc7TawUt84kGwE7DH0/q42R7SddgM2wLLW9p2Wb\nz2odtKW2rabBXJbmbhWTNOl9ZXh7TrIsXf0zq7bJIuv9LW3H3pi51O1hfp1k7yT3mmxxxurXg3/v\nnuSrSX4/S39QbZPkdkv8HuPWbadHJ3lpZqvB1H2ev05y+yQPS7Ln5IqTpA8ZX0nyX5L8UZLtJlaa\nxev2j/sm2WqSBVmkeyR5yuD730stBNfarwb/3ie1bzx38HgazzfdsTWp24etSfLCwfeHJdlrQuVY\njMOTvGTw/ZGZfAfIplieWp3/pky+EdjVpb9OleX2EyzLrbU8fdvqwNT6H7tNtETjdbckKyf03stS\n2/Y3SQ5Ocr8JlWNcuvr/16k6YlJ3wvjN4N+nJXnMhMrQleOmJEckeVmSPSZYls1xcpI/SLWlT5pw\nWabS6InlMUm+meT0CZRlXBbqbVyf5EvpG55L6aVJvp7k7akG7rQa7jlcnlqV+dNJzsp0l/t3uXeS\nHyb5XJLPJzlm8PykGvbLkzw7yWeT/DzV+TErRnsfH5zkG0nOTfKmJKdOolCL0P0tK5Jcm6rnPpQ2\ngWZ0/7trKni+flCG6zK5RtzvMvr5PzrJLo3ee7gOvyTJj5J8P8mxjd5/sUbPQzck+U6Sz2R2ZuHc\nPnU+eHWqA23S7pnkP6ev12fF8lR9+d+SXJDksiR3mGiJFu/wJG9Mck7ad6wO10vbJnloqj6/f+Ny\nLJU/THJFKgC2aLuMznS4R6rT7yNJ7tjg/Yd1deay1H71tCTnpT7jWbA8/Xa8d5JfJLkmybsy+yO6\nY7Ms83fsZUkekWoUTbLHY5x2Tt1zNUkOTZ38H5Wl3Qn+PMlfD77/eCoETXokbiHDDaPu5DGX5D+S\nnDl4PImRoU01vP92n+c7k1ycGqFLKvh9vXGZhveruVQnxAWpUZxPJ/lgZmPK5fD+sU/qJPSqVFi7\nS+pYOrt9sTbLaGfUQUmuTvLToeeW8qSw0GyLk5K8bujxJ5K8Z/D9tI2C7prqcPhGkvNTDaOlNPr3\n75nkNUk2pEY5plm3Hw1/5itTs4o+leT/HXp+2j7nUfsk+VbqnHZ8qmPv+N/1C2M0HN67bfr+QXmO\na1SGzTXcAJ1LTR//TKrjbu/B8+9O8t72RRubXVOjU2c1ft+F6tLXpOrzRzcuyzgsy83/puelOihO\na1SG4ffv6qTnpW4V+ajB4xahaaH6cK9UJvnQ0HPTGuAWmgp+WKrDbJaP9SV191RKX51kxyQfTvLm\niZZo84wexC9J8r1UD937Un/fy5O8IeOfvrNN+oPimYOvv06Fn3uP+b0Wa/TgfUWSi5I8LvV3PDrJ\nf29dqFtp+CDfffCVJCcm+XZqm3ev+UaS/7TA743b8P63ffopY+9O8qDB9zsmuTw1PXgWrEzty59L\nNfq+nQpN30nyrMFrJn2tyu8yuq8/NtWL2vWsfir9TI8VGb/h9799kuenD1AvTo0cdA5K8r9SU/RG\nf7elhcL6OamOtM43kjxxCd77gJHHD03tc12H6JmpRnyyNJ/XOIxOEXtqamSmu6TlS6nRxGR6prGP\n7mt/mKqrjkqVf88kX0iNQrSYOjpcnh2S7D/4/rhU6FkzeDyN+8Dw8XNg+s7d5yX5WpL9Bo93T/J3\nqRHdWfLwJHcafP/G1LkgaT974wFJnpxkVapD98up+n0a94lNsSbVFk+qg/fzqQ70VufXrZP8ZSrM\nd/vkhanjv/U2fXBq8OaU1CDSKak26s6Ny7G5dkt1Jv9pquN+11SHwn1Tbexk+jsfx254dGYudfL7\nq9RUtHemKsO9Uif9dyQ5euj3ptnodLCVqb/j9YPHD0/y/6SmD61J8sVU5TWOBt7K1LD6J9I3ZF+X\n2
qbPGXrdIZmexkZn69Tn/MrU/Pqvpm8YXZM+YExbuTsrUiNw30mFvBMGz78v1dDvTvwnpEZOlmo0\nd/j4WJUa4Tw3Fey3HpTnpPTXA78uyY8zfdexjZ7o7piqND+SvsH3sdx89OtRmb5rnUeP7f1Sn8ul\nqb/pvMHzD0xN6exe33VMLbZu+OPMr5fulxo5vDTJ36YaTqtTo2H7Dl6zZ6rD7KOLfO/FGN6Xjx16\nfFpqCuwRg8cnpvaDcU7NWps6jrv98ImpoHvE0Gu2TdWt3RTcVYN/p6VH/MTMH9U+O9XJsXbouYNT\nn3t3ff0k/4aNTZv8UJInpaa5/u9UcBq+fmnfhX5pDEYXOzoj1QHxrtQ0vKQa5t2o27R0fq3K/JH5\n3VKzXr6WKus9UueCL6SuCevOqS9JjdzNghNSU+Dfk2o3PDZVP9yY6rBI2gSV7VN16SWpBv6lqcD2\n1CQfSPvpoptjdOXzM1JtrvNTM7hWpf6216bvfBmnO6Svf+ZS56IrUrO1/iJ1udDDU5cLXZK+02Tc\nFhoxPCM1U+wBSd6a2ibLBuV4dKbnmO+Mlv+pqWPktal2xpmp0Plnqbbgxn5vi7ZdatSlu65v/9RO\nfkn66YBvS4WRA5O8KLUBp3kjzaVOip9NfyC/I3XifHCqp/ndqRGn4esCnjP4ndWLeN8M3mN9ajvd\nJzXaeWqq9+jC9NOEnpHkqkzuYvJReyT5k1R4fk/qOo53p7ZXV8ZjkvyfTE/4HO082TYV8M4YPPeh\n1Ino7qmVZr84+L77ndMyf5R6HEYrwhWpXrqnD8r7f5M8MjWC+P70+8MLU42Ou42xLIs1fJyvTX+C\nXJ/qWd578Jr7pabh3Tu1/1+Z6gWfpoWVRj+XI1ONwY8MHq9MHY9dnXDu4GdvSfU4j8Mz0o8IPDY1\natO930mp/XXP1Mn+86n9+LLUQkTfyM1HApfSwZl/TeVxqTrz4lTd9sBUmD47869H+lzqs1+MZbn5\nyMmJg39PSdXnx6Ya7aelRo0ekeose1/qmN8mkzVa/q3Tf9brU/vCvVKzILqZD29Nbb9zUtt4Eo5N\nfZ4rU8fMAwbPr0ztjyenOmQuSn9cLE995k/LeMPGzulDTOe+qWMyqZlLP0jtq/ukOhS7DrxpaJA+\nM31HyR2S/E2SJwwefyN1vO+WqjMvSj+CuCLVYJ1226Xahnul6oKfpGaSJfW3/2CJ3nehz/agVGM+\nqf30v6QWeVmRCi6Py/SOgt4+tZ9307B3SgXmT6bfJ96SGv3bOtVpemrGW8fdMbXPHZE6j98jdY4c\nDkdHpT7ooJl/AAAgAElEQVTTFanOn/VjLsNWmd/m6Npl26cGjm6XmjHwvSSPH/zs5FS7/cBMh9FL\nGJM6PjaktldS2/X1qbpg61Rn5FuS/DLVSb3F6zbQXJKHJPluajrQ01Oh6UPpezN3S43M7JcKIOdm\n+hdMWZsaUXjS4PGnU5XWTqlKcriBtF/qJLss4/nwz09dj9L1UN0/1Tu/ItV4uiDVeP9s6vrTSRg9\nQJalRmpfmjqpfC51gfTjh17T9eT+bWo/mOTowuh7H5K+w+SOqQbpx1Ph+ez0ow+vTZV/KUbmRq/n\nfWAqPNwzFSBOTp1QLkrfyfGc1Ojh1amwvybTYXj77peqIL+U2o6HpKbhfTnV+9299qmpxseX0oeF\naTOX6sHttvMZqU6hfQaPn5OqK5al/sZ3pF9ldXNsm74BkcH/+6L0x9LP08+I2GvwXl3w+P3UTJQj\nUp1B70m7BtTKVF3YNeZ3TdWZh6bqta+nRm1un9qv35T++uoDUueQxTgpfd3zwFQD5P+ktsPeqctB\n/ja1rT6Q+gyT6kB7XqZjJdRnpt8OR6bK/pvUeeiE1ArYb0+V/1Op6ymXpaZg/0XjsiZ9YN4rVcY1\nqc/7utRnkFQ99uHB90enLsl4c6pB+N6Mb7vvkfrMu5HtE1Lh7W6
pjoZzU5eyfCPzO0nenhoZnaRl\nmX+cHph+++2c2qaXp2ZHvTV9aPpU6rOfls7djblDKix1IfBjqfPcN9Mv5Nj97F/St8GWwp8MyrJV\nan+8PrVPvDvzZ5o8LHW+arVI2q21bWpksxsYeVyqU+Kzg59359hfpIL241Jt9HGEruG24BtTMx5/\nljrvHJrkH9KHzG1T7dsjU+2aj6afqbFYO6U63p4weHx6apSwu4zqy0n+LTUbr2s/dRnl/PQdZZM0\n3DFyr9T5/MjB4xemOuY7T07VnQekMtaTsvjz5kwY3khbpULHP6V6NJI6SD+d2hjda89Pv2TwtE0R\nTOZf3J9Ur8KfpBr790y/+M826Reg2T/V+PtJFnfd0r6pE8mzBv/n7qlK5Mj0J6Lh62e3z/Tc9/GY\n9NPAnpI6KSbVMH9j+kbJK1KjddOwGufw/rttKmD+19S0hu662semXzDp4anP+JRUA+m+Gb+jU43w\npD7fZ6camHdPBbgfpEZfh1eL7E4ee2Z6Rj1He5Z3TfWA/klqX/5uKgjtmAoI52V6QvMteXBqOtHZ\nqRGyx6b2/Tdn/iIVV6fCQ7L4UZQ7p46ju6bC5KpUw6ELmQ9NNZo6x6Q6Tk4ePN4+FUx+mGqgLLXR\nv/d+6bfFjqmG3NVJXpDaD16QOoe8IxX8FlM/DC8uc8fUdNQrUw3zpJba774fbqSfnL5DcdJTbod7\nv/8wtXDbN9PXRZ9MnSuS+WFtfdpcj76QrVLnyG6kblUq5L0zdWw/KNUwfFHqc7kk/UjNvqkO1sPH\nWJ651DHadeB+KTXdtztGj09N+Vw/9DsHpZ8O+OQxlmUxjkjVkWelwnN3nn1V+r/ldanO6nuk/oZD\nGpfx1npIarXey1NhadvUtYGfTD91c6v0QXS/LM1I9P7pFz57Raou2i/Vcf7Sodf9cfpAM9wROA1G\nt8urkvx/6euHJPnHzL8W+F2pY227LH4QaKEFhp6Wqt9PGfrZuan6PamO+0+mP/7HoRsMWJa6pv/j\nqc/zw4P3Pjd1XHSrGXeOT7VJd81k26XLUqOX3f6/depSr++ktucXU+Fy51Rd9oTB6w5MfdbPzuTP\nW81tl6oUX5s6MB+QCmtdpX96qqG0PjVUfGWmdw798Id3bKoxf7vU3/jWVA/Ol1J/Z9eT86rU1OKP\nZ3GrJx6Z6qF9aWpbfSU1mvJnqe3b9Q7dOdWzNKkeuNE59YekrjN5fuqgXpkq20WpAL1fajTri6ke\n5fMyf77/JKZgjx6kj0w1jl48ePyy1P68V6oz4HuD51+Z+VNdx1me0ZNIdwuAs1Ofd+eDqRP1joPH\n70zte5OeJrgxj071siZ13N8z1VD6aGqUoZse9uXUVMJpm5K/UKPnjFSP7qrUVNv3pcr97PQjjUl9\nhouZmTDaGfbVVAdfN9X3oFQ9sW7w+IvpA+nqVHDpTmb7pEadWtyWYbjMXW/yE1PbqbtE4+z0nTxv\nSXX8HJjqxR3X/Sv3S4Xez2R+j3FSQfwRg+8PSx1D30w/wjQpC9UFD0qNWFww9NyuSf49fefT/VKB\n7tJUJ0Vr3XF7Sup8dUnqvHBkKtSflvrb9k1Nvf5IaqRzKRYcGh45nEvVOXdN1fO/TF+33jN1vJw9\neN2fpqZbTnLq2vCq69unOrY/mNovd0wF+K6D4WOpOjSp89JZ6ReZmVanpMr/5+nbgVekQsBDU8Ho\nDakOqitTf++2mb9dNtdCsz4ekf7a44tSwXO/9AH5Mantf3Xm3/9zGhv63Qj+7qlRshenrx+emQoy\nx6VmRVyZ8d854ZGpz+uoVHh6eKrd3AXfXVJt3DenOtLfmvpsx7Et75j+Hsjbp859r04fNOcG7/vs\n1KyI96U6IS9OrSD7kDGUYTG6jtBLUvtbUsf761KLpD0glT+6DqeHpD7P7vcOyXTuk2M1PN02qQbY\nl1MjNl3DZufURhuemnpU6hq
LV2f6FhMZtVeqYr88FUQ+M3j+Pqnw2V33+fnUiXZNFtcz193C5amp\nSjmp7fys1El661Sj4uHpK9BpuMF4N3XhCekbvW9JfcbrU/tAF5LmUg3L4ZPjJILG6HveM7VtL01d\noP83g+d/L/11SNumeup+kOolHXcDflnqxNutYnuPVGPpX1Mnkjuneu4eOfj5PqlR+ItSo4hvTR8y\nJmm08tszdex8IfU3LU9Vlm9K38B7dypUrU2FgJbXJd6S4X1ll9Ro98rUfv/Z1LTNb2f+9VX7pU5s\nT8riTwbD779r+oD7vcyfLnVG+uXX16amZo5rKtOtsX/mX6d7VCoQXzYozz6pemH94OffSTVGfz/V\neH5xFtcbPnw7jWWpRtbb0i/cdk3mT+c+JbU41Hapxso03JN6eJ+5Q2qbdMfK7qnAOTz6cmZqEZqk\nRoqWYtXgW+u4JL9K1Znd3/O4VOOv63DYKxX4f5Px3qt0NLzfb/Cel6ZGt5K63u3nQ6/ZOzVK8olU\n+JjUpSwLtSEOSE0XP3PouVNSdczBqfrmK6nQPO3raeydqr+6Ec8b04+8nZYapNg7Vbe9LNVRPa6p\nkKPB837pw+/LU51R/zn9AomdP0q1yV6c6TJ6K7aHpYLWhalBnqNS2/LtmX9P+v+UOv9+KP2lIptj\n9DjbefB/XpgK7B9OzR7YOhVIh8uwd6rD9OiMx3A53peq57+U2nful5qx1s2qODE1Anvv9G3SR2Ty\nhj/Leyb5H6nje2Vqm/5dqvOxm+14+9S2/Uy2nNtZ3qKFrie4e+bfe6bbkPdKNTxfkmogT8tFvaMW\nqvQfkv5ag5ekTpKPHTx+UfpFC1an74ncHHdO9b78LFXB/W362xBslTo4PpUKQM9JjbROyzUdJ6ZG\nfg5I9cx2iwWsSlUsV6amOq3byO+3XtRhoVGF41Pb+1WDxw9IXRfQncQflwqk90qNLu6zBGXqHJKa\nPnlFagRtm1QA7gLx6anGdHcNw/JU42PcZdocC10on1Sg/tACz389tRR7Uh0sZ2YyIzYbM/q3HJO6\nl+dlqbrsdqk6YPj6sLlU3bBVqgG7mPA5/Lt7pMLZZ1INoR1TPdnvTr8vbJdqhHYn+Xts5O9YSjum\nRpG6a6eXp0YyRu+Zd3yqQXKXVKB/f2rbLqYRMLqq6Q6pk/NvMv9WX09MjbwPuzR9x88kje4vD0mF\n97emzg9dsHxTqod82C8zfwXcVkYbwWtSsxr2S3VKvjr9NUu7pmaPvDD9zJ1ds3Sj8XunZg59MrVt\nDk01SrvyfD/Vwbt9+kWbNnfBwHF7SKrsXTBfn7qMadibB89vl9qe0zqjrLMsdax9IX1ZX5Z+OmZS\nbZ2FrpFf6Ny9qY4deXz/VNvka4OvnVMdJldk/jVzT0q/rwybhoWHhrfFnVLtk1en2mK/lzovfTJ1\nLnpCqs44LXV+2i2Lb3sNn1d2GJRhdfpF0V6RCn0Xpdo161Lnq2+kzlPjvM3JcP2zVyqk/TJ9u3N1\nqoN2eD87O9XGmoZreIe35d6pNsbZqdH2zw2e/5tUeTv3Tm3jrTId6xMsqdWZHyL2S50U/zRVsR+U\nuv3IqJWpE/vnU9dLTZvRg/DhqV6jroK5U+qEdWaqx/RfB88fnH5EZzEel+qpeUBqJ/qr1NTb69Kf\nEO+dhRvwLY02Yu+WCpkPSPVgvio1enB15oeh+6Qal9O2+t7OqZP3w1P79ltSvYQ7DH5+cfrQt0fq\nM1mKlYVHT2R/lJpi94Wh5/ZKNZSOTDXu3p46iUyTx2R+yHh45l8/dVYqTK9M/zffPXXy/0n6IDot\nRnuV/zx1AlubOk7fMHi8S2oU5Rmp4NfdxmGxI9HD9dKKVE/yo1Lb9JuphsYO6Xt4t08F0RMzmRuk\nj3Y+HJvaNjumFpXprvHvRkZ3SV3f+d70+8W4HJ1qSJ6bWmjmGal9LOk/16+kGmRXpfbdSXfoL
dR5\nc2JqdLi7XvbkVLDrAsm/pMLH5alg1Xo2zGhY7j7DO6Q6F14yeM3rU9f1dn/fQwc/H/d186Pb76hU\nyPhN5rc9XpO+bj8idcxek9pPJmE0WO2eOt9/PnU5y2WpjrmtUvvx8GUfx6ZGPKclNG/MU9JPab9v\nqvPxrqm/+9DUfv3Qwc+PT30Ww/vXYsLSNqmQu0OqLn1iar/opl9/NBX0D0+d57+WOg9/LjUrZ3im\nwWgn1yQMb4uVqQ6Uz6TOp7dPXSN8dWqw4sOp2VtbpcL0d9PXJ+PyklSHXjcCt1Wqk/yvU/X+e9Lf\npvCAVHtgKerbo1Lnxg+kzs3PTn8LtBWpz/dj6dvVd8l0XcPb1V9PTH/P7junBiROSNVV56T+hrek\nZuI9vXEZJ2IutdO+M9WzsjbVWz18jeJBqd6N4Z7m9akPeRqNXr+4InXAfD/1AXc9JfdNHUBJHdy/\nTn9fsHH04Nw9dQuN7sbh909VlhekpoK8K3Vy7BYLaV35LdTreFCqd6arVD6aOqGflFrIYbT38nXp\ne+4nMTVoeEpeUh0pP0vtn10v7Mmpv6lb0Ge3VMNlqfbf0YDxZ6ke711SleK300/JTur460Lpw9Pf\nh3TSuvudPiF1wr5/6gR+eaq3/sRUIPhw5k+rfWBqBHHXTE/jaaGpw29MNQTfnuSGVP23PNXx9OnU\nsXBY6mT/ocy/BdOttW/6RTiSami+JLXdPpjqzPlq5l8v99jB+16fOim1rh82Nur97dTJcevUNjx9\n5HdWphoET854g9OJqf3vHqn97l8G/16XfvZKUsfYM9JPZ5+k0evoH5rqaNo+Vf+/a+jn706/IMrv\npzpGhxcha2W04+xh6Ufn5lLb/yup4+MPUg3Sp6fODUelv/53HEb3+W5K34NT+913089WSiogX5Z+\nUZR903c8TtL2qfPQoanO7qQ6af41dQxtneqw/lYmH4I21apUO+Hy1GUr3b57Tmq214rU39XdY3Oc\nI4vD/1fXhl2ZOv9fk362xT1THc5/kDoWn5ZqCw5PF50Gu2X+dNU1qW32kfTng21T+3o31fQDqU6L\nbh2WxW7fuZF/T02NcO469JpdM3+GyQWDco7e+micdktNnT8pFdrOS52Tf5j5bbrTU3XopHXtv67u\nPzx1jM+l9r1nDb32cak7hiR1PD02VS9M+yWMY9HtsHukKpJnpk7czx08v2zwXHdN3A9SQfV7Wbpb\nU4zTrqnK8OXpR5VWpaZg3jXV0D83dYI9K9V7+rCb/zeL8vr0i4msSFXGZ6au+Tkh410dbFONhs79\nU43wx6ZOGNukDvgXp06ap6fC9D+nAumO6Q+u72cytwAY7WTovDL9lOm9UqPZy1KjcH+Zfns/PtUb\nPc6T/eh0tbulGuuvSFWMf5M61p6bfrn0pDo7vpa+13bSlqUawPsMPff+1MnoUYPHD0udAOZS+/NH\nUxXnZalAOk3TxUb39+1SjaXh1WQ/kWqczA1+/rz0K6guRrc/7JPaLt19EF+WflGMawZfwytadtdN\nHZSbN+hbNFBHr1d5S/qT/QNTgeTgVKC6JBUOfy81pfQ5WRpbpzo0HpMaPfxqarT6KZn/WU7aslSZ\nuqlTy1Of9/dSx/7PUp1fJ6Tq3eMHr7tHqmNyWuqBp6U+861TndLd6Nx2qQ6+bvXNB6VG65e6k+Tk\nVKfDg1Odum9KdRD9r8xfaOW5qVkEk5pKOXpeOiUVFrqV9XdItTvelApGX0l1PCZ1vpj2Wyvsn7oO\n75Xpr7k9OtUR8aDBz7+YflGaPdLPMFrsIkOjdXnXufDFQXmSauu9If1MlVelzl8LTcec9HTbbjvs\nkRrgeWUq4B2UKvcX04/k3T51HfADU6Pj703NPljMAl+jx+sB6c89r0g/EDF8v83LUx2jnxv8u3+W\n1l6pUd9urZH7pdqmL0wNirw4VY/uu+Bvt7VQ/dfdpnJd6
rP72dDPjkgNfL3q5r9223BMqofostRG\nOjcLX6O4Y+oAPjr96o/TpDs4ugrq2amD58xUL80l6acNviL9iNMTU/PWRy9MH5c7piqUrpFxz1QD\nbfeN/sbSOiC1oFTSL+Lx3VSweF/6sLxnqmfr4vQdEs9JvyR0UkG19dSmrUce3zXVsD998P0fphqj\nH0+NdN+YKvPdUiNNLVY+XJsKmOek74m9fPD+q1Mnku+mpledmxpZm4bOnJWphtBc+nt0XpSq6O+R\nmtp0avpjrOukSOpveG2mY4GUYcMNnUenP7kekJqi2TX8jkvVEd0K13dOdVKMdips7nvvktpGF6fq\nofNTIWR5qqF/6dDr3p9qzO049PuLuUbq1hj+W7dL1Q+fStWlV6TvoOtus7FNqu74QGqfflKW1v6p\nz6kbWf+31Gf440xmevJCdsn8hVV2TjUwbpe65OJ/puqhO6ZmR7wp/fH/oLSfMjy6fx+cmkb50fSd\nIqelzpOdZ6RWNO5WlRxemGqctk7VoXdLNTB/njpPHZMa/Tk0Fe4vW6L3vzUWmu1xu9SKoMMrG++d\n6kDpfC91Dtsl07Hw4O9yaip4/FmqvdiNhu2QmvHwjtTf8PpU4F6qffmI1Gy2rw8eH5YKKXdK1atv\nT78q++7p71femYaFnEbD7wWp+qzrpNw51VZ7YPrt+ITU/v+TLG6m1MaC91NSt7NK6hj/i8y/hntF\nKtg/Ie0WdNspFc5PGnruO6l66rmpfW6PRmVZyPL0C68ltW+9IP1o7E6pY+P9g8efT/09908NfL0i\nk5npMnF7piruB6UazZ9ITQv776n7+CW1Yc+dSOk2zcamif0m/VTbY1KN6eFpWv8z/Y3Ll7rSf1r6\nYfZpcFX6yuv49Kt/fjtVAXbXa3S9swuNLkyiAn90Khh1jbXjU2U+NTXqeU2qclybfgp1d0+6ZGmu\n8xz1uFSD43GphvmPU5Xl40ded0Sq0T4tjeakwvv5qTD2wNT08QvS9yS/cfDVBaO7pI6jaVrVNqkT\n0/BS+gemOiS+lup46np1T0s1srtOjQ+mRnHG3Wh6UqqReUJqlsWXU6FuuBFwbqpBd00qyLcOIaPv\ntzw1crMh/bXwz0idOPdPbdOvpJ96tSJt6oTufn57p0aVv5tqhIxz0YvNMXxLkKQau2el6py7pVY5\n/Grqeq7he2Aemfrsx33Lp02xsU6Np2T+5Tadr6W/Ufo7UzNKlrozenmq4/NjqYD30tS05YekAsfr\nU9v+nzP/tl+tPSy1TZIasfmL9FP2T838NTRWpkLc21KN0gsy3vuiLpXDUwvfdPf+3j4VhNYNHh+S\nmuXTTYcdx+Ipo50j26dfyfiM1IrRXef+61Mz85KaEn5+pv+e01unOvKelJpl9r5U+6s7xz4zVT8M\nB6zFdlbvnTq+u87E4zJ/IafvptqsazP/soDTBuWbxOymZ6RGfO+Vmtn4hUxmUbZRXZ3/ptQaL69I\ndcrfKbVvdp/bXVP1VrcuyZ+mRreH70F7m9MNbXfTGP4o1fh5W2q0Y/QaxWnW3Yesuxj+0alrgzrP\nTo2Gdo2pYzP/hr1LaWUqOC1mNGWc9sn8aQCPSPUk3jFVyVyb/n6Td0p/cuzKP4lrVZOqpM9Jv3Ld\ns1KV9rGpSvOv0lcIO6d66H6Ym4e/pXT31O0J7pY6MV48VN7lmb8PToPhDpzbpfbTt6U+8+enpgR1\nDbvdU5Xm8ekbro9PnRCmYb9Oar/94/QL4xyfGjHrVpjbO3Vi2CvVM/n+9LdF2jeLu1n3aGfYqlSD\n/bvpG0JHpQLov6emMH4kFfQOTDU6hhtMLUY890mNIHXXQz8oNXq3Q6reGl5BeufUqEbXQ/9XqWOs\n5WffTZ/+Tqoj7aG/++VNDP/9+6aO921SDeLXp+qkLih37pv+WqBJL5Zxu9T5suuge/Hga6vUPtDt\nh4emPvMfZf51TC2ck
apLT001jndJtV2uSG3fpRqB3ZjuM39u6rx5TKrz6g2pUcE3Dcr39NQUyQ+k\nn666LLWfvCVVx06z+6RGc7qZIaemAmB3G5s/T50Tktom6zL/sqLFdEot9Lt7pjrqun3ytNRIfFL1\n0w9Sdf4dUh3ot/T/tXT3VJDqbp91dOpYOjv99NHHpNo4w+ehr6X2o3GtbLtN6nM7K3XsnJnab7uO\n5Humzk8rU22V81MzdC7L5NZ92SaVQz6datM9bULl6Iy2gx+Rmkr7ofSf7zvS394xqQ7uL6ffhqOz\n+W5zdko1KIZXkftOqvI8NtWQm8Q1irekO5C6neD01EjYE1I7wGsGP786NVUkqd6596R6dyddEU2D\nc1InlqQalN0tVh6QGj0eXcBjEttstEE/l2q0P3/w/Rmp28Ccl/7eozukGlRPTPWSjnNBjE11dqq3\nbi7VuP9aambBt1NTraZlYZ7RxZKSCp5vTgXLlalK80lDr31eartOw/Lmw0ZPzutSYXqbVA/yG9Kf\nGN6a6slPaj//RBbfWz/8/t01OStS2+6//f/tnXuwVeV5h58DHDhwQKMQELkkXlKIVG0hyMUbQaaO\nYoKI1U4VpRUjNkdDrCUtI2qJV4bGxiTGCiFyrLWMsQSpUSO1UC9JKmoSjZM2tk2dpqaZzDhtk3Q6\nnbH949lfv7U35yBwztlr7eP7zDiyb7D22mt933v9veRSxrFoTDxYe/0c3OBPKHx+KAN/v6XN8z3o\nBH+cLA3/ELlFIfV7pmjzhZg1Ppnmj1kqspDyN/Di92/Dfef7aCgNxXvpUcwSXoLX2bra+8pSZi3u\nnWDw40Vcp17APssuXAOKe/855EqIslSF78Ue2TQSbiblBfNSZiON92knK2WnsuRz0WFfitfCd6jv\nU6uyHTIS1/7d+L0ex2vgMPxONxXe+wb7VvL0NShV/PxKPJcd6By9VHs+KTP/kBwQuYYcIKsK6buM\nx/U23Udr0C5Ir02uvfdzaLdejlnK0+nfPstJGAx5BNf3CfhbX0hORu0ll4yOpBqj4MAAbdl9u0Xm\nY0XOH+Cevrnw2hDsWf8o7pcPYmXEZIL/pwsX9NloVD6JJ3PC/j5UEj0t2CMxgjMOb6Z/INemn4Yq\nc8lQmUV1MjVlMwrLbUdg9OYBvEEepb5UuSyKv/UE8oZ/KjpxZ2EU9jk0RMBFcitGbcvc3MejIZL6\nAs7GiGN/jyboDzoxErqBLH6RhqBPxuDUJlxEr8dsaBVmKiZ6qioYio5fN2Y6F+N3mFN4z9vksscO\nDp3iddaJTtxezBwvwPVpI/VDzmdgNPk3KIdGx3ERGiPFWc9vkNswbsWseKIvWeLBQGP0eyL+pg9R\n3y81EveirbXHc9ABvYPyAzipjG8dGkdHYqBkPQZq7sPAyCfQ6bub8tVkJ2KF1pv072ifg+Vocp8c\neK6S8vdOrMQCndKN5CD4Zlqj5G452oPJkb4Gr42Xce04CwPYSU/hlxr/gkOkMeA8C6/H5/He2lB7\nPqlwJ7rRCa1i/2yjs3QmHu80dF7ewvtsCwp9fRIDmOsxO3oxfaNxrV+GLXbr0Hbqxsqb5bjGp0D+\nXXiNl93aUGUWYTCk2D7xA+r7/5ehvfp39G0O9qAlpbafQoWtZpYrHiiNRuYSvFFT6dKfAT/CRSpF\n6tONs4vmlwu1CqvIEvYLcNGZUni9zLEq6c93oUG/mez43IhG8bHY3P1q7fVvkyOKZbOKavX9wr4j\na+bgAroGnY0kpz4GjeSUIV+JKpfdlD9TsUhxTTgDBZMuwVKiw/AaSQbf3bhmpBLX8+lbOVGjEzIa\nMzTrMMr9Zcx8t+EGtYns4HeigVd0fMvIJq4i98Mmsbb31x5fTC5tm4UZ0aqO3SqL+di60IVK639R\ne34kee08DvsXU/ltGQHQxn/zd9HYfC/e04+hk1k0do9AZeON9L8yfF/ooBoZkE1kxzL
N8wTP1Say\nkvF1ZIe0ig5SkZloS21A+6kDv8sjeD18nax4v548G7KvyrZQv/514Pr9NnmG9EwUFroQq0m+RxZI\nuxkztMUsbNnZ5cb1PAU/x6DznhzoWVhpMAJL97+Cx94f1R3F+z5V4HyBLLw3F7PZXbiv34zVLv9U\nO76yA05V5zS0Oxejo7kYf7/t6NRfi3t92ZU6LcFUqmVcwr6jNoagg/w8Royewajo72MUMkVF52Am\nYjTV+05Vog1VYj/Q8HwZvaqNBv2x5NEpoKG0C42+iZixTQvpVFwAqhStq1Lfb29iIyfgeX4f9vHs\nwY28HbO128jRvMN7+HwZtFMfJBmB9//zWOq2Ho2oYRhU+TJu8r+KysRFtbpDoTFKvxjL0U7FczS5\n9u88hOVryQH+NAp0NBrPzRypUiy73YllyGn23Gy8x4r9va+Qsw7FzF5g9PtFsubAFOzvKZZYplLs\nxZQzWqN4zw8ht9pcSXaetmJJdWIsZhnDaNo/qYIoOZV/hcHQoRjseg0ryX5AvXpnFRmD1+rT5AkI\n4PKEbe0AAAv1SURBVD57Hzlo96fAf+E+OxA9t8Pw2tuB69Ee8iisMbjfJyX5aRhIuaR2LFtqz5VN\noxDWGRhA/y4Gx4/ADPID1I9bSqOlVvXz8cwln8dLcX9PLWrt5HFjKbg4j1xRFuyfsRhc2oH7+2YM\n1nwN14M0PSRocVIkbis29iZD6qtYajcfL4TdtddfpjWEk6pAUjTrjyhmfzAfnYkuzGaPx8zMbsyG\nbsSN6ip0NAZ6BtVg4hh0Km8kl39OwsXyUlxQnyE3+K+ifjZl2bTh5ngl9n5dhMd3au21FWj47SFn\nnNZgxLmNvjufRYO+E/u69qDR9CQ6byvIgkdXAz9GQ+4EylG67CnwMA9L7RNpPb0BjZNkjEyjOWOL\nWpEU/T4PM4Ur0IH/KmaR16IDP6Xnjw84xcDGiWjQfxcFpZ4jz0qeh/f/7XjPvIxZkg7KD5xVnVXk\nPrlzMSDRiXvYoxiYKuv3PxAOw31gKwbyLkcDOq35aQzIanQ+b2Nfteb+shfG4P20Fq/Jx3Fd/QVZ\nxO947E2+o/a4vXY8ezG4V3aGOc2mTQHfjWi3HI+2zOfI/al3YoXRFKw8eIa+70+NHIn28Im1Y3gN\ny5e3kNXLL0N7a21Pf0FwUCzAdqZxlDsWJugDjfM8z8fNcQPWqf8jjrcAI0lfwwVzBEalP041Ziq2\nGlUwNlJWoVhDfwl5w1mKtfTLMUI/h6A3Gg2DmVhml2Zh/hz71j6Mvb/govkYGh2dVIdiJcRYVLX9\nZxTPGVZ7/TIsH5qE68OzGLk/CR3W/uoZS0rGu9D4TEral6HxvhUj9R1Y0vQUfZvXdqgU7+dhaESm\nbMFJWE43ufA6tcd/icJMUTmyf1L0+1Fy9HsHlmDdj/dU2QJ+c7EMfDsavhejc/QzbLdJvby/jIHc\nLxJr6sHQhkr7M2qPHyaPfqv6+I8uFMzahoGSi3HN70ZHNImynYbOyUDO9AQzsJvRWerGbOxwzNR/\nq/C+U8hVBodjIGU+5dLYNpSqIj6GarJpfb2IrDA/HZ3sRQxcP/M4XJ/W4VSLtaincQXe/xuwbWlJ\nb39B8I4Mwaz3l9B2XVHq0QSHTG9lgtuAb5CNpd/CTT71T92Pi34YTK1Psab+fOz32I5RRDCinMoE\ng545m/pemLm1/y9DR2ghZu3SkOQJ2E/zWeDvcUOvagDnV/Befwaz4sn5a8cI+LLa4+sw0ru6n//9\nMzHDdQOW1b2Mzu7w2jHch+ON1qP63UaaH5WfTDaIwXP2Qu2YNqEhuQz7Y1cU3nc5RsxnU63gQ6tw\nJmY4htE3Uav+ojH78T0MiIDXQTce77PkkRrBwTOP7CBNJ69BVeYodEwm1h5fjwb00biu3UfvTt1A\nVUgdhkG916nvRZ6E86irKODS07mYDvw3WZjpCWw
RAc/3OsySDcGM50AG/kdjRvtp6md3Ho+/82qq\no27bynwQq0fKFEYL+oljcEzCtXijHI0GZyrDmIob6KcKjxvnPQWtSWNN/SaMKu/FCN5XKF89supc\nhNHNE9FZeh0dpj9BVbuHyb0nnWSJ9ZXU96SUTVEwqQMdpO1Y7tiOAhQ3kaPhazGivB0d7IHoY7kA\nhTFSZmstZkOTsXE6KnR+gHplyGb2Ac/AcuAFeH7WkY23Z7EndhwGeL6JUfDdeK1UuVywihSj33up\nVvS7mP34FgYg2rD/dzv5t/71Hj8dHAzfIM9QrSrHkks8R+EekdbIaXgNX4fXyIO43haD+o16HP3N\nEAyQbCg898fYN1mludnQsyjmH5FHvt2JiRPQNv02OSN+HmZGmxWYvBqD+rOxDWQP2tfNnpsbBJWj\ncUE7BctCbsDFaDcax9dhVG5M7TPLcHN9T3MOMyiJBajSOZGI1B0M96BBfDbeM7ejs/QT7FUCxRC6\nsTy1SvRk5IwnlwanKONZ1M8wfj9eL0nRb6DYQTaSpqBDt4KcNZxdeG+jYFEz6MSgw0/w91+JvT97\nqVc470DD81LKGwczGKhq9Lu37McibFuZ29OHgkOizHm470QbOkevYWDq87h2XoPBicSfo4rzNMqb\nU30UHt+TWF2SxmglytanaHTC2zG493WsJtiF60E7tgqlftmkJAzN/w4jUNNhG5aIrtz/24Ng8NNb\nmeDHUNkMvFGvxYVxOPZSpUh+JxHBGaxETX3fOQnLgNJ9tQiFh7ZhUOcBjMreWsbBHSBLcGP/Hcwm\nzsfjLzp4XSiO8SY6Ac0wBE/CnqmkuNhFvVJkoqye6nbMfL6Kxsc5GKxLFSSph7VqWYWg/ylmP2bg\nHno/Idz2buF0LBFO2bixaF/dg9UcO7GS5Ap0+h4kz/6Echy+NqwgKWaUy3Y8G5XLO/C+ehj3n6S+\n/xlymfsKcmn2+yg/yNdJtQMlQdA0eioTXIsRryQD3o4luDuxXGE1KvhVYQZYMLBUNavQSmzE4A14\nz3Rh5m4BltpWRaVtJPuOz/kkiuHMxuztC7Xnv4jrQCrDToqTCwf+MOu4BbNI4LmtYiXGrah2OByN\nzr/BsRAvonp0WVmOoHkUsx8vUb1qh2BgOBmznl9AJ/R1sqjQzNrzJ2MQ6rNod43CQFqVBGl60wVp\nJlPxHKVZuGej0u2d2Pf9BmaTwf3qAbJy+F7KGb8UBME70FOZ4A2oKpdu9jPICp1BEBw44zECmwSb\nZqNjN7HXTzSfNjR4LsBezqQeeyOWXv0e9v6mod0fxojzufv8Tc3lKFyXjiAbSGVH6RuZCHwH558e\ngz3Vq/AcBu8uIvvx7mAUivn8J3Bv7bmR6DD9duF9f00etTQUqzfuxnEc05typNUnrecdGMD7DAYb\n12MQL4k0fQSDO6kF42YM/CV9hSAIKkhPZYI3obrlNzEa9yp5nmcVxoQEQStxFfb+VI2h5Pt5Hip0\nvo6iE2AVxH+gIzq29tzxtf/fhkIOwTuzHHvqX6L5WeIgCJrDEFwXn8dRZTtRsA/sBV6CLRcX4Jr6\nBLm3chgGp65v4vG2EpPQ6XwEg7lHAg+Rg6Xg6K20dx3FwAjgBUHQz/RUJngHljCcQ/lz1IKglRmB\nfdXNVGHdHz0pKR6L4g2byWViZwA/LrxnISp3TqdaJfitkFWajWW4QRAMPs5Eh/MPUVzoFuz5fpN6\nVfOlaFttIbc8pD2hSmtqmTSu50tx3vQ64DksVT4cZz7fg/NzAWahaNJYgiBoGXoqE1xNtcoEgyDo\nX6ZgpcOnyKqLW3D+2+G19yTlwCewImJp8w8zCIKg0izF0VAn1B5PxbmTr6AjWgw8Du3lz+922qg/\nT6k//vNk8aC52CbWhQG9TdjSkvarEMUMKk/c9PX8HM/JTdgY/69obP6szIMKgqDfGAr8b+3PbdjX\neTqOCRmNwae
nUHTsNOwL/3ecrfc48CNgDZaSBkEQBJnvo8L1FFQN/wVWkOzCMtG3sBcc8jo8BJ3W\noJ6ktH4Knp92dOyfAP4N28bOwrn0P8XRX88B/1P7LwiCFqNqZYJBEPQPxfv5g+iMvo2RZFBu/xZU\nZ23HIFQ3BqKqpMYYBEFQVU4GfoiBPVBBfAkwoawDakE+gkrhH0JH9C0UxbyLrGR7GQZG15RxgEEQ\nBEEQHDhzgT3AY8BvAjswywkGnX4Nnc4PYWnTRcCc5h9mEARBy3IvBu66USTnvYXXqqbQXUWGY+nt\npSja9iTqElyNInkbUMgpAqNBEARBUHGOxNmdJ2K28xXs8f4X8liASSigcVsZBxgEQTAIGA88DVxR\neC4qyg6O47DcNvWA/hT4BKoHr8aS2yAIgiAIKs44FMJYh2Jjd9SePw+j9YnJTT6uIAiCwcYqDPJB\nOJ+HwnHY0zkVnc2/xTLcEBgKgiAIghZiNCrZPo0R+sRxOON3eRkHFQRBMAgZAVxJ6GkcKqPQ4dyL\nwk2hvB4EQRAELcrV2EszG5iBQg+3k8ucgiAIgqAqLCRmJwdBEARBSzMCuArl7V8EVpZ7OEEQBEEQ\nBEEQBMFgp5OYgxwEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAE\nQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQQX5PwiRLL2Irt44\nAAAAAElFTkSuQmCC\n",
       "text": [
        "<matplotlib.figure.Figure at 0x10906a350>"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Note we can also use the Spark RDD method 'countByValue' to generate the occupation counts\n",
      "count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()\n",
      "# print each result under the label of the approach that produced it:\n",
      "# count_by_occupation comes from an earlier cell (presumably the map-reduce version - confirm),\n",
      "# while count_by_occupation2 is the countByValue result computed above\n",
      "print \"Map-reduce approach:\"\n",
      "print dict(count_by_occupation)\n",
      "print \"\"\n",
      "print \"countByValue approach:\"\n",
      "print dict(count_by_occupation2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Map-reduce approach:\n",
        "{u'administrator': 79, u'retired': 14, u'lawyer': 12, u'healthcare': 16, u'marketing': 26, u'executive': 32, u'scientist': 31, u'student': 196, u'technician': 27, u'librarian': 51, u'programmer': 66, u'salesman': 12, u'homemaker': 7, u'engineer': 67, u'none': 9, u'doctor': 7, u'writer': 45, u'entertainment': 18, u'other': 105, u'educator': 95, u'artist': 28}\n",
        "\n",
        "countByValue approach:\n",
        "{u'administrator': 79, u'writer': 45, u'retired': 14, u'lawyer': 12, u'doctor': 7, u'marketing': 26, u'executive': 32, u'none': 9, u'entertainment': 18, u'healthcare': 16, u'scientist': 31, u'student': 196, u'educator': 95, u'technician': 27, u'librarian': 51, u'programmer': 66, u'artist': 28, u'salesman': 12, u'other': 105, u'homemaker': 7, u'engineer': 67}\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Exploring the Movie Dataset"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# load the raw movie records from the u.item file under PATH\n",
      "movie_data = sc.textFile(\"%s/ml-100k/u.item\" % PATH)\n",
      "num_movies = movie_data.count()\n",
      "# show one sample record, then the total number of records\n",
      "print movie_data.first()\n",
      "print \"Movies: %d\" % num_movies"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0\n",
        "Movies: 1682"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def convert_year(x):\n",
      "    \"\"\"Return the year given by the last four characters of x as an int, or 1900 if they are not a number.\"\"\"\n",
      "    try:\n",
      "        return int(x[-4:])\n",
      "    except ValueError:\n",
      "        return 1900 # there is a 'bad' data point with a blank year, which we set to 1900 and will filter out later\n",
      "\n",
      "movie_fields = movie_data.map(lambda lines: lines.split(\"|\"))\n",
      "years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))\n",
      "# we filter out any 'bad' data points here\n",
      "years_filtered = years.filter(lambda x: x != 1900)\n",
      "# count how many movies there are for each age (in years, relative to 1998)\n",
      "movie_ages = years_filtered.map(lambda yr: 1998-yr).countByValue()\n",
      "# plot the movie ages histogram: the ages are the data and the per-age counts are the weights\n",
      "# (passing the counts as the data would plot the distribution of the counts instead)\n",
      "ages = sorted(movie_ages.keys())\n",
      "values = [movie_ages[age] for age in ages]\n",
      "# one unit-wide bin per age; the extra edge closes the bin of the oldest movie\n",
      "bins = range(ages[0], ages[-1] + 2)\n",
      "hist(ages, bins=bins, weights=values, color='lightblue', normed=True)\n",
      "fig = matplotlib.pyplot.gcf()\n",
      "fig.set_size_inches(16,10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "png": "iVBORw0KGgoAAAANSUhEUgAAA6kAAAJPCAYAAACetZKYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGH9JREFUeJzt3W2MZfVBx/HfZWcJQp92tILAGghFlxprqwmlttqp1gYa\nKSa8qMSHptXKC6lNbCrWF2Z45UM0moZYSYOmMU1J7LaGxiItqZM0puVBy4PKVqAlARYQ26VpmzTy\nML44l91hXLhzZx/mt3M+n+Rmz73n/Oeenf+d2f3OOfdMAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAMA2dnGSfUnuS3L1Ydb/SpK7ktyd5F+SvGaOsQAAALBhO5Lcn+ScJDuT3JnkgnXbvCHJy6fLFyf5\n8hxjAQAA4KCTZqy/MENoPpjkqSQ3JLls3TZfSvKt6fKtSc6eYywAAAAcNCtSz0ry0Jr7D08feyG/\nkeSzmxwLAADAyC3MWL86x8d6S5L3JHnjJsYCAADAzEh9JMnuNfd3Zzgiut5rknw0w3tSD8wz9rzz\nzlt94IEHNrq/AAAAnFgeSPKqjW48mbF+IclXk/x8kv1JbktyRZJ712zzw0m+kORXc+iiSRsdmySr\nq6sOuo7V8vJylpeXt3o32ALmftzM/3iZ+3Ez/+Nl7sdtMpkks9vzoFlHUp9OclWSmzNcrff6DJF5\n5XT9dUn+MMmuJB+ZPvZUhosmvdBYAAAAOKxZkZokN01va123Zvk3p7eNjgUAAIDDmnV1XzimlpaW\ntnoX2CLmftzM/3iZ+3Ez/+Nl7pnHhs8LPoa8JxUAAGCbmvc9qY6kAgAAUEOkAgAAUEOkAgAAUEOk\nAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAA\nUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOk\nAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAA\nUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOk\nAgAAUEOkAgAAUEOkAgAAUEOkAgAAUEOkHkW7FhczmUzmui3s3Dn3mF2Li1v9VwUAADgmJlu9A0lW\nV1dXt3ofjorJZJK9+/bPNebyPWduasx2+ZwBAADb22QySeZoT0dSAQAAqCFSAQAAqCFSAQAAqCFS\nAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAA\nqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFS\nAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAA\nqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFS\nAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAA\nqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFS\nAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAA\nqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFSAQAAqCFS\nAQAAqCFSAQAAqCFSAQAAqLGRSL04yb4k9yW
5+jDr9yT5UpLvJfnAunUPJrk7yVeS3LbpvQQAAGAU\nFmas35Hk2iRvTfJIktuT3Jjk3jXbfCPJ+5L80mHGryZZSvLNI91RAAAAtr9ZR1IvTHJ/hiOiTyW5\nIcll67Z5Iskd0/WHMzmC/QMAAGBEZkXqWUkeWnP/4eljG7Wa5JYMEfve+XYNAACAsZl1uu/qEX78\nNyZ5NMkrk3w+w3tbv7h+o+Xl5YPLS0tLWVpaOsKnBQAAYCusrKxkZWVl0+NnReojSXavub87w9HU\njXp0+ucTST6d4fThF41UAAAATlzrDzxec801c42fdbrvHUnOT3JOkpOTvDPDhZMOZ/17T09N8tLp\n8mlJ3pbknrn2DgAAgFGZdST16SRXJbk5w5V+r89wZd8rp+uvS3JGhqv+vizJs0nen+TVSX4wyafW\nPM/Hk3zuKO47AAAA28ysSE2Sm6a3ta5bs/xYnn9K8HO+k+S1m9wvAAAARmjW6b4AAABw3IhUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAA\naohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAaohU\nAAAAaohUAAAAaohUAAAAaohUAAAAaohUAAAAamwkUi9Osi/JfUmuPsz6PUm+lOR7ST4w51gAAAA4\naFak7khybYbYfHWSK5JcsG6bbyR5X5I/28RYAAAAOGhWpF6Y5P4kDyZ5KskNSS5bt80TSe6Yrp93\nLAAAABw0K1LPSvLQmvsPTx/biCMZCwAAwAjNitTVI/jYRzIWAACAEVqYsf6RJLvX3N+d4YjoRmx4\n7PLy8sHlpaWlLC0tbfApAAAAaLKyspKVlZVNj5/MWL+Q5KtJfj7J/iS3ZbgA0r2H2XY5ybeT/Pmc\nY1dXV7fHQdfJZJK9+/bPNebyPWduasx2+ZwBAADb22QySWa350GzjqQ+neSqJDdnuFrv9Rki88rp\n+uuSnJHk9iQvS/JskvdnuJrvd15gLAAAABzWrEhNkpumt7WuW7P8WJ5/Wu+ssQAAAHBYsy6cBAAA\nAMeNSAU
AAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCG\nSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUA\nAKCGSAUAAKDGwlbvwPGya3ExTx44sOHtdyws5Jmnnz6GewQAAMB6o4nUJw8cyN59+ze8/eV7zpxr\n++fGAAAAsHlO9wUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCG\nSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUA\nAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCG\nSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUAAKCGSAUA\nAKCGSAUAAKCGSAUAAKCGSAUAAKDGCRmpuxYXM5lM5rqN2WY+Xws7d849Ztfi4lb/VQEAgBPcwlbv\nwGY8eeBA9u7bP9eYy/eceYz2pt9mP18+xwAAwPF2Qh5JBQAAYHsSqQAAANQQqQAAANQQqQAAANQQ\nqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAA\nANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQ\nqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAA\nANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQ\nqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAA\nANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANQQ\nqQAAANQQqQAAANQQqQAAANQQqQAAANQQqQAAANTYSKRenGRfkvuSXP0C23x4uv6uJK9b8/iDSe5O\n8pUkt216LwEAABiFhRnrdyS5NslbkzyS5PYkNya5d802b0/yqiTnJ3l9ko8kuWi6bjXJUpJvHrU9\nBgAAYNuadST1wiT3Zzgi+lSSG5Jctm6bdyT52HT51iSvSHL6mvWTI95LAAAARmFWpJ6V5KE19x+e\nPrbRbVaT3JLkjiTv3fxuAgAAMAazTvdd3eDHeaGjpW9Ksj/JK5N8PsN7W7+4fqPl5eWDy0tLS1la\nWtrg0wIAANBkZWUlKysrmx4/K1IfSbJ7zf3dGY6Uvtg2Z08fS4ZATZInknw6w+nDLxqpAAAAnLjW\nH3i85ppr5ho/63TfOzJcEOmcJCcneWeGCyetdWOSX58uX5TkySSPJzk1yUunj5+W5G1J7plr7wAA\nABiVWUdSn05yVZKbM1zp9/oMV/a9crr+uiSfzXCF3/uTfDfJu6frzkjyqTXP8/EknztaOw4AAMD2\nMytSk+Sm6W2t69bdv+ow476W5LWb2SkAAADGadbpvgAAAHDciFQAAABqiFQAAABqiFQAAABqiFQA\nAABqiFQAAABqiFQAAABqiFQAAABqLGz1DjC/HTt2ZDKZbPVuAAAAHHUi9QT0zDPPZO++/Rve/vI9\nZx7DvQEAADh6nO4LAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABA\nDZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABAD
ZEKAABADZEKAABADZEK\nAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABA\nDZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEK\nAABADZEKAABADZEKAABADZEKAABADZHKUbNjx45MJpO5brsWF7d6tw9r1+Licfm7zPs8rZ8vAAA4\nWha2egfYPp555pns3bd/rjGX7znzGO3NkXnywIHj8neZ93laP18AAHC0OJIKAABADZEKAABADZEK\nAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABA\nDZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEK\nAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABA\nDZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEKAABADZEK\nAABADZHKCWXX4mImk8lct4WdO+cew/axmdfMrsXFuueAeXldAvTwPXk+C1u9AzCPJw8cyN59++ca\nc/meMzc1hu1hs6+ZtueAeXldAvTwPXk+jqQCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QC\nAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQ\nQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QC\nAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQQ6QCAABQ\nQ6QCAABQYyORenGSfUnuS3L1C2zz4en6u5K8bs6xAAAAkGR2pO5Icm2G2Hx1kiuSXLBum7cneVWS\n85P8VpKPzDEWGKmVlZWt3gW2kPkfL3M/buZ/vMw985gVqRcmuT/Jg0meSnJDksvWbfOOJB+bLt+a\n5BVJztjgWGCk/GM1buZ/vMz9uJn/8TL3zGNWpJ6V5KE19x+ePraRbc7cwFgAAAA4aGHG+tUNfpzJ\nkezEpZdeuuFtTznllCN5KgAAAIrNisuLkixneF9pknwoybNJ/mTNNn+dZCXD6bzJcKGkNyc5dwNj\nk+GU4PPm3XEAAABOCA9kuI7RUbEw/YDnJDk5yZ05/IWTPjtdvijJl+cYCwAAAHO5JMlXMxzx/ND0\nsSunt+dcO11/V5KfnDEWAAAAAAAAgBdzcYb3sN6X5Oot3heOrb9J8niSe9Y8tpjk80n+K8nnMvz6\nIran3Un+Ocl/JPn3JL8zfdxrYPs7JcOvJ7szyX8m+aPp4+Z+XHYk+UqSz0zvm/9xeDDJ3Rnm/rbp\nY+Z+PF6R5JNJ7s3w/f/1Mf9j8KMZvuafu30rw//7Tpi535HhNOBzkuyM96xudz+T5HV5fqT+aZLf\nmy5fneSPj/dOcdyckeS10+WXZHgbwAXxGhiLU6d/LmS4bsGbYu7H5neTfDzJjdP75n8cvp7hP6Zr\nmfvx+FiS90yXF5K8POZ/bE5K8miGgxUnzNy/Ick/rbn/+9Mb29c5eX6k7kty+nT5jOl9xuEfkrw1\nXgNjc2qS25P8WMz9mJyd5JYkb8mhI6nmfxy+nuT71z1m7sfh5Um+dpjHzf+4vC3JF6fLc839Scdw\np2Y5K8lDa+4/PH2M8Tg9wynAmf55+otsy/ZxToaj6rfGa2AsTspwtszjOXTat7kfj79I8sEMv4bu\nOeZ/HFYz/IDijiTvnT5m7sfh3CRPJ
PnbJP+W5KNJTov5H5tfTvKJ6fJcc7+Vkbq6hc9Nn9V4TYzB\nS5LsTfL+JN9et85rYPt6NsPp3mcn+dkMR9TWMvfb1y8m+e8M70t6od/Nbv63rzdm+KHkJUl+O8Nb\nf9Yy99vXQobf+PFX0z+/m/9/xqT5395OTnJpkr8/zLqZc7+VkfpIhvOTn7M7w9FUxuPxDIf7k+SH\nMvxHhu1rZ4ZA/bsMp/smXgNj860k/5jkp2Lux+Knk7wjw2mfn0jycxm+B5j/cXh0+ucTST6d5MKY\n+7F4eHq7fXr/kxli9bGY/7G4JMm/Zvj6T+b82t/KSL0jyfkZTv07Ock7c+iCCozDjUneNV1+Vw6F\nC9vPJMn1Ga7u95drHvca2P5+IIeu4Pd9SX4hw1E1cz8Of5Dhh9DnZjjt6wtJfi3mfwxOTfLS6fJp\nGd6bdk/M/Vg8luFtfT8yvf/WDG/1+EzM/1hckUOn+iYn2Nf+JRmu8nl/kg9t8b5wbH0iyf4k/5vh\nm9a7M1zx75acAJei5oi9KcMpn3fm0CXJL47XwBj8eIb3I92Z4VdRfHD6uLkfnzfn0A+jzf/2d26G\nr/s7M/zqsef+n2fux+MnMhxJvSvJpzJcTMn8j8NpSf4nh35QlZh7AAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAADG6v8AkvLjXfcce0YAAAAASUVORK5CYII=\n",
       "text": [
        "<matplotlib.figure.Figure at 0x109dd7110>"
       ]
      }
     ],
     "prompt_number": 46
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Exploring the Rating Dataset"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# load the raw rating records from the u.data file under PATH\n",
      "rating_data_raw = sc.textFile(\"%s/ml-100k/u.data\" % PATH)\n",
      "num_ratings = rating_data_raw.count()\n",
      "# show one sample record, then the total number of records\n",
      "print rating_data_raw.first()\n",
      "print \"Ratings: %d\" % num_ratings"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "196\t242\t3\t881250949\n",
        "Ratings: 100000"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# split each tab-separated record and keep the rating (third field) as an int\n",
      "rating_data = rating_data_raw.map(lambda line: line.split(\"\\t\"))\n",
      "ratings = rating_data.map(lambda fields: int(fields[2]))\n",
      "max_rating = ratings.reduce(lambda x, y: max(x, y))\n",
      "min_rating = ratings.reduce(lambda x, y: min(x, y))\n",
      "mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)\n",
      "median_rating = np.median(ratings.collect())\n",
      "# divide by a float: in Python 2 int / int truncates, which would drop the fractional part of the averages\n",
      "ratings_per_user = num_ratings / float(num_users)\n",
      "ratings_per_movie = num_ratings / float(num_movies)\n",
      "print \"Min rating: %d\" % min_rating\n",
      "print \"Max rating: %d\" % max_rating\n",
      "print \"Average rating: %2.2f\" % mean_rating\n",
      "print \"Median rating: %d\" % median_rating\n",
      "print \"Average # of ratings per user: %2.2f\" % ratings_per_user\n",
      "print \"Average # of ratings per movie: %2.2f\" % ratings_per_movie"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Min rating: 1\n",
        "Max rating: 5\n",
        "Average rating: 3.53\n",
        "Median rating: 4\n",
        "Average # of ratings per user: 106.00\n",
        "Average # of ratings per movie: 59.00\n"
       ]
      }
     ],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# the RDD's stats() method reports count, mean, stdev, max and min in a single call\n",
      "rating_summary = ratings.stats()\n",
      "rating_summary"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 36,
       "text": [
        "(count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5.0, min: 1.0)"
       ]
      }
     ],
     "prompt_number": 36
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# create plot of counts by rating value\n",
      "count_by_rating = ratings.countByValue()\n",
      "# sort by rating value so the bars and their tick labels do not depend on dict iteration order\n",
      "rating_values = sorted(count_by_rating.keys())\n",
      "x_axis = np.array(rating_values)\n",
      "y_axis = np.array([float(count_by_rating[r]) for r in rating_values])\n",
      "# we normalize the y-axis here to fractions of the total (the bar heights sum to 1)\n",
      "y_axis_normed = y_axis / y_axis.sum()\n",
      "\n",
      "pos = np.arange(len(x_axis))\n",
      "width = 1.0\n",
      "\n",
      "ax = plt.axes()\n",
      "ax.set_xticks(pos + (width / 2))\n",
      "ax.set_xticklabels(x_axis)\n",
      "\n",
      "plt.bar(pos, y_axis_normed, width, color='lightblue')\n",
      "plt.xticks(rotation=30)\n",
      "fig = matplotlib.pyplot.gcf()\n",
      "fig.set_size_inches(16, 10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "png": "iVBORw0KGgoAAAANSUhEUgAAA6QAAAJQCAYAAACZy6lmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGxRJREFUeJzt3W+MZfd90OHPZDdGxE1UG9JadlxME8MmTZM0FXZwaJiW\nEJkK6qJ94Ub9I0ibGoTTIrVgilBZChK0KpVoXaIt8ouqijCVnLQGxTiJkGmoUzsu6SY23spusPCf\nBJzEqUilCLsZXpxrdzLdzZ3d2c13d/Z5pKs55/7OufOLdJy9nznnnlsAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAnGHXV8erR6pbTjB+Q3Ws+lj129V3bBt7rPr4auz+szpLAAAA9pUD1aPVVdWLq9+p\nXr1jm4u3LX/zavvn/c/q0rM4PwAAAM5TL1ozfk1LYD5WPVvd3nJGdLs/2Lb8NdVndoxv7GF+AAAA\n7FPrgvSK6vFt60+sntvpu6uHq7uqH9n2/Fb1oeqB6p2nP00AAAD2m4Nrxrd2+Tq/tnp8W/Ur1Z9f\nPf/m6lPVy6sPtnwW9cPbd3z961+/dezYsd3OFwAAgPPLseoNJxpYF6RPVlduW7+y5SzpyXx49Zp/\nqvpsS4xWPV29r+US4C8L0mPHjrW1tdvu5XQcOXKkI0eOTE8D9syxzH7hWGY/cByzXziWz76NjY3X\nn2xs3SW7D1RXt9zU6KLqxurOHdu8sj/6nOgbVz8/W72keulq/eLqbdUndjtpAAAA9rd1Z0ifq26u\n7m654+5tLZ8VvWk1frQ6XP1Ay02PvlB9z2rssuq9237Pe6oPnKmJAwAAcH5bF6S13Kjorh3PHd22\n/DOrx06f7CTXCfPVtbm5OT0FOCMcy+wXjmX2A8cx+4Vjeda58JUsWz5DCgAAsD9tbGzUSdpz3WdI\nAQAA4KwQpAAAAIwQpAAAAIwQpAAAAIwQpAAAAIwQpAAAAIwQpAAAAIwQpAAAAIw4OD0BAIAz7ZJL\nL+3zzzwzPQ3Yk6+95JKe+dznpqcBZ9XG9ASqra2trek5AAD7yMbGRnccf2p6GrAnhw9dnvfJ7Acb\nGxt1kvZ0yS4AAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkA\nAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAj\nBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkA\nAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAj\nBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkA\nAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAj\nBCkAAAAjBCkAAAAjBCkAAAAjBCkAAAAjdhOk11fHq0eqW04wfkN1rPpY9dvVd5zCvgAAAFygDq4Z\nP1DdWr21erL6aHVn9fC2bT5U/fpq+Zur91Wv2uW+AAAAXKDWnSG9pnq0eqx6trq95Yzodn+wbflr\nqs+cwr4AAABcoNYF6RXV49vWn1g9t9N3t5z5vKv6kVPcFwAAgAvQukt2t3b5Or+2enxb9SvVoVOZ\nxJEjR15Y3tzcbHNz81R2BwAA4Bxxzz33dM899+xq240142+qjrTcnKjqJ6ovVT/9Ffb5vZbLda/e\n5b5bW1u77V4AgPU2Nja64/hT09OAPTl86PK8T2Y/2NjYqJO057pLdh9oCcurqouqG1tuTLTdK7e9\n+BtXPz+7y30BAAC4QK27ZPe56ubq7pa75t7
W8lnRm1bjR6vD1Q+03LjoC9X3rNkXAAAA1l6y+9Xg\nkl0A4IxyyS77gUt22S/2cskuAAAAnBWCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAA\ngBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGC\nFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAA\ngBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGC\nFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBEHpycAwLnjkksv7fPPPDM9DQDgAiFI\nAXjB5595pjuOPzU9Ddizw4cun54CALvgkl0AAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIA\nAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABG\nCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIAAABGCFIA\nAABGCFIAAABGCFIAAABGCFIAAABG7CZIr6+OV49Ut5xg/HurY9XHq9+sXrdt7LHV8x+r7t/LRAEA\nANhfDq4ZP1DdWr21erL6aHVn9fC2bT5ZvaX6/ZZ4/aXqTauxrWqz+twZmzEAAAD7wrozpNdUj7ac\n6Xy2ur26Ycc2H2mJ0ar7qlfsGN/Y2xQBAADYj9YF6RXV49vWn1g9dzI/WL1/2/pW9aHqgeqdpzNB\nAAAA9qd1l+xuncJrfXv1jurN2557c/Wp6uXVB1s+i/rhnTseOXLkheXNzc02NzdP4dcCAABwrrjn\nnnu65557drXtustp31QdaflsaNVPVF+qfnrHdq+r3rva7tGTvNY/rb5Q/esdz29tbZ1K9wJwtmxs\nbHTH8aempwF7dvjQ5Y5lznuHD12e98nsBxsbG3WS9lx3ye4D1dXVVdVF1Y0tNzXa7htaYvT7+vIY\nfUn10tXyxdXbqk/sftoAAADsZ+su2X2uurm6u+WOu7e13GH3ptX40eonq0uqd6+ee7blZkiXtYTq\n87/nPdUHztTEAQAAOL+tC9Kqu1aP7Y5uW/6h1WOnT1ZvOM15AQAAsM+tu2QXAAAAzgpBCgAAwAhB\nCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAA\nwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhB\nCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAA\nwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhB\nCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAA\nwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwAhBCgAAwIiD0xMAAAD+uAMHDrSxsTE9DTirBCkAAJyD\n/vAP/7A7jj81PQ3Ys8OHLj/pmEt2AQAAGCFIAQAAGCFIAQAAGCFIAQAAGCFIAQAAGCFIAQAAGCFI\nAQAAGCFIAQAAGCFIAQAAGCFIAQAAGCFIAQAAGCFIAQAAGCFIAQAAGLGbIL2+Ol49Ut1ygvHvrY5V\nH69+s3rdKewLAADABWpdkB6obm0Jy9dUb69evWObT1ZvaQnRf1790insCwAAwAVqXZBeUz1aPVY9\nW91e3bB
jm49Uv79avq96xSnsCwAAwAVqXZBeUT2+bf2J1XMn84PV+09zXwAAAC4gB9eMb53Ca317\n9Y7qzaexLwAAABeYdUH6ZHXltvUrW8507vS66t+1fF70mVPctyNHjrywvLm52ebm5pppAQAAcC56\n8L57e+j+e3e17caa8YPV71Z/pXqqur/l5kQPb9vmG6r/Un1f9VunuG/V1taWk6kA54KNjY3uOP7U\n9DRgzw4futyxzHnPccx+cfjQ5XWS9lx3hvS56ubq7pa75t7WEpQ3rcaPVj9ZXVK9e/Xcsy03NDrZ\nvgAAALA2SKvuWj22O7pt+YdWj93uCwAAAGvvsgsAAABnhSAFAABghCAFAABghCAFAABghCAFAABg\nhCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAF\nAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABg\nhCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAF\nAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABg\nhCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAF\nAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABg\nhCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABghCAFAABgxG6C\n9PrqePVIdcsJxg9VH6m+WP3YjrHHqo9XH6vuP+1ZAgAAsO8cXDN+oLq1emv1ZPXR6s7q4W3bfLZ6\nV/XdJ9h/q9qsPrfXiQIAALC/rDtDek31aMuZzmer26sbdmzzdPXAavxENvYwPwAAAPapdUF6RfX4\ntvUnVs/t1lb1oZZgfeepTQ0AAID9bN0lu1t7fP03V5+qXl59sOWzqB/eudGRI0deWN7c3Gxzc3OP\nvxYAAIAJD953bw/df++utl0XpE9WV25bv7LlLOlufWr18+nqfS2XAH/FIAUAAOD89dprr+u11173\nwvqv/uLPnXTbdZfsPlBdXV1VXVTd2HJToxPZ+VnRl1QvXS1fXL2t+sSa3wcAAMAFYt0Z0ueqm6u7\nW+64e1vLHXZvWo0frS5rufvuy6ovVT9avab6uuq9237Pe6oPnMG5AwAAcB5bF6RVd60e2x3dtvzp\nvvyy3ud9oXrDac4LAACAfW7dJbsAAABwVghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghS\nAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAA\nRghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghS\nAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAA\nRghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghS\nAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARghSAAAARhycngDs\nF5dcemmff+aZ6WkAAMB5Q5DCGfL5Z57pjuNPTU8D9uTwocunpwAAXEBcsgsAAMAIQQoAAMAIQQoA\nAMAIQQoAAMAIQQoAAMAIQQoAAMAIQQoAAMAIQQoAAMAIQQoAAMAIQQoAA
MAIQQoAAMAIQQoAAMAI\nQQoAAMAIQQoAAMAIQQoAAMAIQQoAAMCI3QTp9dXx6pHqlhOMH6o+Un2x+rFT3BcAAIAL1LogPVDd\n2hKWr6neXr16xzafrd5V/exp7AsAAMAFal2QXlM9Wj1WPVvdXt2wY5unqwdW46e6LwAAABeodUF6\nRfX4tvUnVs/txl72BQAAYJ87uGZ8aw+vvet9jxw58sLy5uZmm5ube/i1AAAATHnwvnt76P57d7Xt\nuiB9srpy2/qVLWc6d2PX+24PUgAAAM5fr732ul577XUvrP/qL/7cSbddd8nuA9XV1VXVRdWN1Z0n\n2XZjD/sCAABwgVl3hvS56ubq7pa75t5WPVzdtBo/Wl1WfbR6WfWl6kdb7qr7hZPsCwAAAGuDtOqu\n1WO7o9uWP92XX5q7bl8AAABYe8kuAAAAnBWCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGC\nFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAA\ngBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGC\nFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAA\ngBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGC\nFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAAgBGCFAAA\ngBGCFAAAgBGCFAAAgBGCFAAAgBEHpydQtbGxMT0FAAAAvsrOiSC94/hT01OAPTt86PLpKQAAwHnF\nJbsAAACMEKQAAACMEKQAAACMEKQAAACMEKQAAACMEKQAAACMEKQAAACMEKQAAACMEKQAAACMEKQA\nAACMEKQAAACM2E2QXl8drx6pbjnJNj+/Gj9Wfcu25x+rPl59rLr/tGcJAADAvnNwzfiB6tbqrdWT\n1UerO6uHt23zndWrqqura6t3V29ajW1Vm9XnztiMAQAA2BfWnSG9pnq05Uzns9Xt1Q07tvmu6pdX\ny/dVX1t9/bbxjT3PEgAAgH1nXZBeUT2+bf2J1XO73War+lD1QPXO058mAAAA+826S3a3dvk6JzsL\n+peqp6qXVx9s+Szqh3f5mgAAAOxj64L0yerKbetXtpwB/UrbvGL1XC0xWvV09b6WS4D/WJD+h1/4\n2ReWv+ma63rttdetmzcAAADnoAfvu7eH7r93V9uuC9IHWm5WdFVLXN5YvX3HNndWN7d8vvRN1eer\n/129pOWmSP+3urh6W/XPTvRLbnzXj+9qsgAAAJzbXnvtl59k/NVf/LmTbrsuSJ9ric27W+LytpY7\n7N60Gj9avb/lTruPVn9Q/e3V2GXVe7f9nvdUH9j9/wwAAAD2s3VBWnXX6rHd0R3rN59gv09Wbzid\nSQEAALD/rbvLLgAAAJwVghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIAR\nghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQA\nAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIAR\nghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQA\nAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIAR\nghQAAIARghQAAIARghQAAIARghQAA
IARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQA\nAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIAR\nghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARghQAAIARuwnS66vj1SPVLSfZ5udX48eqbznF\nfTnLHrzv3ukpwBnhWGa/cCyzHziO2S8cy7PWBemB6taWsHxN9fbq1Tu2+c7qVdXV1Q9X7z6Fffkq\neOh+/5GxPziW2S8cy+wHjmP2C8fyrHVBek31aPVY9Wx1e3XDjm2+q/rl1fJ91ddWl+1yXwAAAC5Q\n64L0iurxbetPrJ7bzTaX72JfAAAALlAba8YPt1xy+87V+vdV11bv2rbNf6z+VfWbq/UPtXxe9Kpd\n7Fv1O9XrT33qAAAAnAeOVW840cDBNTs+WV25bf3KljOdX2mbV6y2efEu9u1kEwMAAODCdrD6vZaz\nnRe1nM080U2N3r9aflP1W6ewLwAAAJzUX6t+t+UGRT+xeu6m1eN5t67Gj1VvXLMvAAAAAJw31n3E\nAADgVHzd9ARgP3tRf3QX5XV3U4Zz2YHqX1Y/Xb1teC6wV39iegJwBrxsegKwRweqn6oeafl4IYOE\nyv70jpabTf3U9ERgj/5y9dst32/8aPUvqutGZwSn50DLHemPVn8zZ/w5f/296jeqb12tr/vGBjjX\nvKUlRF+6Wn5sdDawD31N9evV36/+e/Wq1fMHxmYEp+8t1fdvW//5ljf1cD75q9X/qN5d3Vg9WP25\n0RnBqXv+JMY/qu6tfmlwLrAXb6ie3rb+jdUlQ3MhkbIf/b+Wfyjurv5sy1/i76i2JicFp+n/VA+1\nHL9bLWdKr6g+ODkpOEUvrj7ccgPAh6rNlpsAnuir0OBctdUSpYdbvoP+z7RcuvuJlveT3mdwvvh0\nyx8Fv7/lo0DvaHm//MXqeI7lrzpBuj/9/urng9XN1f9q+Qoe/2Bwvnlu9Xj+uP3xln8s7h+bEZy6\nz7ZcEvaylitY3lhdXX2pejj/v8z5YaPlWP3W6qnq8epvVB9djX9xaF5wOn6j+ictX1f5t1q+ovLa\nlj+EPzU3Ldifbmr5j+55L56aCOzBwZY/qNxVvXL13Dflj2qcf/7u6ufbq3/TH30OD84Xt1cXt1zi\neKwlTL9tdEZwei7bsX5X9dcnJnKhc1Oj/W2j5QYaT7e88fmF6ltGZwSn57mWP6Z8pnp99Z+qf9Dy\npgjOB8/f+OXdq5//vuWPK75ygPPNx6t/W/3XliuyfrflM9Jwvvn0tuVXtvzx+zNDc4F97SUtn136\nTPUjw3OBvfiLLZc4/rfqB4fnAnv1jS1/jf8L0xOBU/SPq//c8lnoqp+p/uHYbGBv/nT1K9UD1Q8P\nzwX2rR9rOTvqu+84372i5Y3QRdMTgdP0ouobql9uefPzztnpwGn5k9uWX1R9/dRE4Ay4uPo7eZ8M\nZ5XLsgHOHV/X8jlSb3443/kuXQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAuED8fxddEko2\nMyM8AAAAAElFTkSuQmCC\n",
       "text": [
        "<matplotlib.figure.Figure at 0x109d51390>"
       ]
      }
     ],
     "prompt_number": 59
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# ratings per user, step 1: key each rating by its user id and group the rating values per user\n",
      "user_ratings_grouped = (\n",
      "    rating_data\n",
      "    .map(lambda fields: (int(fields[0]), int(fields[2])))\n",
      "    .groupByKey()\n",
      ")\n",
      "# step 2: the number of values grouped under a user id is the number of ratings that user made\n",
      "user_ratings_byuser = user_ratings_grouped.map(lambda kv: (kv[0], len(kv[1])))\n",
      "user_ratings_byuser.take(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 60,
       "text": [
        "[(2, 62), (4, 24), (6, 211), (8, 59), (10, 184)]"
       ]
      }
     ],
     "prompt_number": 60
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# bring just the per-user rating counts back to the driver as a plain list\n",
      "user_ratings_byuser_local = user_ratings_byuser.map(lambda kv: kv[1]).collect()\n",
      "# and plot them as a normalized histogram\n",
      "hist(user_ratings_byuser_local, bins=200, color='lightblue', normed=True)\n",
      "# enlarge the figure that hist() just drew on\n",
      "fig = matplotlib.pyplot.gcf()\n",
      "fig.set_size_inches(16, 10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "png": "iVBORw0KGgoAAAANSUhEUgAAA7MAAAJPCAYAAACuOYO5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X+sXnd9H/D3zbVZCWzEl0pJ/ENzlIQlqaaSdgruaMtd\nF1oTdQkoElkmBMuqEWk1MCqVJPtj2NI0CBoMRdFSawTklY7Q4S0yan5AEVelQ3VJl4QA8YgN0eyY\nOF2x0cj+WHLr/XGO7ceP773neXyv7/N87vN6SUf3/Piec76Pv3F03/6e83kSAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAJg425McSPJckrsWaXNfe/zpJNe3+34myf4kTyX5XpKP9bSfSfLVJN9P8pUk\nl6x4rwEAAJhY00kOJtmaZH2aYHptX5ubkjzSrr8lyZ/1HLu4/bmu3f/WdvsTST7Srt+V5OMr2WkA\nAAAm2y8leaxn++526fV7SW7r2T6Q5NK+Nhcn+VaS6xZoc1m7DQAAAAO5qOP4piSHe7aPtPu62mxu\n16fTzOYeS/L1NI8bJ02QPdauH8u54RcAAAAW1RVmTw54nalFzptP8uY04fZXk8wuco9B7wMAAABZ\n13H8hSRbera3pJl5XarN5nZfr58k+aMkv5hkLs1s7GVJXkxyeZKXFrr5lVdeefLQoUMdXQQAAKCg\nQ0muOt+Tu2Zmn0hydZoCUK9J827svr42+5K8t13fluREmrD6szlTpfi1Sd6e5pHjU+e8r11/X5KH\nF7r5oUOHcvLkSUvB5aMf/ejI+2AxfpO4GLvai/GrvRi/uouxq70Yv7pLkis7E+sSumZmX02yI8nj\nad5/fTDJs0nubI/vTlPJ+KY0VY9fTnJHe+zyJHvSBOaLkvx+kq+1xz6e5A+T/FaS55O8ezkfAgAA\ngMnSFWaT5NF26bW7b3vHAuc9k+QXFrnmj5PcOMC9AQAA4BxdjxnDeZmdnR11F1gG41eXsavN+NVm\n/OoydrUZv8nVX4V43Jxsn6UGAABgDZmamkqWkUnNzAIAAFDOxIXZf/uxj+XyTZvOWh577LFRdwsA\nAIAhDFIAak157uChvP2f3JFf+c13JUk+/+/+TQ4fPjziXgEAADCMiQuzSfK6v3VJ3njZxiTJay9+\n3Yh7AwAAwLAm7jFjAAAA6hNmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYA\nAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAA\nAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAA\nKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACg\nHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIBy\nhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoR\nZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeY\nBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEW\nAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChnkDC7PcmBJM8luWuRNve1\nx59Ocn27b0uSryf5bpLvJPlgT/udSY4kebJdtg/ZbwAAACbYuo7j00nuT3JjkheSfCvJviTP9rS5\nKclVSa5O8pYkDyTZluSVJB9O8lSS1yf5iyR
fSROMTyb5VLsAAADAULpmZm9IcjDJ82nC6UNJbulr\nc3OSPe36/iSXJLk0yYtpgmyS/DRNAN7Uc97U+XYaAACAydYVZjclOdyzfSRnB9LF2mzua7M1zePH\n+3v2fSDNY8kPpgnAAAAAMJCuMHtywOv0z7L2nvf6JF9K8qE0M7RJ8yjyFUnenORHST454H0AAACg\n853ZF9IUcjplS5qZ16XabG73Jcn6JHuTfD7Jwz1tXupZ/0ySLy/WgZ07d55en52dzezsbEeXAQAA\nGDdzc3OZm5tbset1hdkn0hR22prkaJLbktze12Zfkh1p3qfdluREkmNpZmsfTPK9JJ/uO+fyNDOy\nSfKuJM8s1oHeMAsAAEBN/ZOTu3btWtb1usLsq2mC6uNpKhs/mKaQ053t8d1JHklT0fhgkpeT3NEe\ne2uS9yT5dpqv30mSe5I8luTeNI8Yn0zyw57rAQAAQKeuMJskj7ZLr9192zsWOO9Ps/g7ue8d4L4A\nAACwoK4CUAAAADB2hFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxh\nFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZ\nAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYB\nAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUA\nAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAA\ngHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAA\nyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAo\nR5gFAACgHGEWAACAcoTZJB/+8O9kamrq9LJhZmbUXQIAAGAJ60bdgXHw8ss/zd4DR09v33rNxhH2\nBgAAgC5mZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFm\nAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoZ5Aw\nuz3JgSTPJblrkTb3tcefTnJ9u29Lkq8n+W6S7yT5YE/7mSRfTfL9JF9JcsmwHQcAAGBydYXZ6ST3\npwm01yW5Pcm1fW1uSnJVkquTvD/JA+3+V5J8OMnPJdmW5LeTXNMeuztNmH1Tkq+12wAAADCQrjB7\nQ5KDSZ5PE04fSnJLX5ubk+xp1/enmWW9NMmLSZ5q9/80ybNJNi1wzp4k7zyv3gMAADCRusLspiSH\ne7aP5EwgXarN5r42W9M8fry/3b40ybF2/Vi7DQAAAAPpCrMnB7zO1BLnvT7Jl5J8KM0M7UL3GPQ+\nAAAAkHUdx19IU8jplC1pZl6XarO53Zck65PsTfL5JA/3tDmW5LI0jyJfnuSlxTqwc+fO0+uzs7OZ\nnZ3t6DIAAADjZm5uLnNzcyt2va4w+0Sawk5bkxxNcluaIlC99iXZkeZ92m1JTqQJq1NJHkzyvSSf\nXuCc9yW5t/35cBbRG2YBAACoqX9ycteuXcu6XleYfTVNUH08TWXjB9MUcrqzPb47ySNpKhofTPJy\nkjvaY29N8p4k307yZLvvniSPJfl4kj9M8ltpiku9e1mfAgAAgInSFWaT5NF26bW7b3vHAuf9aRZ/\nJ/fHSW4c4N4AAABwjq4CUAAAADB2hFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAA\nKEeYBQA
AoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACg\nHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIBy\nhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoR\nZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeY\nBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEW\nAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkA\nAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEA\nAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAA\noBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKCcQcLs9iQHkjyX5K5F2tzXHn86yfU9+z+b5FiSZ/ra\n70xyJMmT7bJ94B4DAAAw8brC7HSS+9OEzeuS3J7k2r42NyW5KsnVSd6f5IGeY5/LwkH1ZJJPpQm+\n1yd5bNiOAwAAMLm6wuwNSQ4meT7JK0keSnJLX5ubk+xp1/cnuSTJZe32N5IcX+TaU0P2FQAAAJJ0\nh9lNSQ73bB9p9w3bZiEfSPNY8oNpAjAAAAAMpCvMnhzwOv2zrF3nPZDkiiRvTvKjJJ8c8D4AAACQ\ndR3HX0iypWd7S5qZ16XabG73LeWlnvXPJPnyYg137tx5en12djazs7MdlwYAAGDczM3NZW5ubsWu\n1xVmn0hT2GlrkqNJbktTBKrXviQ70rxPuy3JiTQVjJdyeZoZ2SR5V86tdnxab5gFAACgpv7JyV27\ndi3rel1h9tU0QfXxNJWNH0zybJI72+O7kzySpqLxwSQvJ7mj5/wvJHlbkjemea/2X6epcHxvmkeM\nTyb5Yc/1AAAAoFNXmE2SR9ul1+6+7R2LnNs/i3vKewe4LwAAACyoqwAUAAAAjB1hFgAAgHKEWQAA\nAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAA\nKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACg\nHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIBy\nhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoR\nZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeY\nBQAAoBxhdgHT09OZmpo6vaxbv/6s7Q0zM6PuIgAAwERbN+oOjKP5+fnsPXD09Pat12w8ZxsAAIDR\nMTMLAABAOcIsAAAA5QizAAAAlCPMAgAAUI4wCwAAQDnCLAAAAOUIswAAAJQjzAIAAFCOMAsAAEA5\nwiwAAADlCLMAAACUI8wCAABQjjALAABAOcIsAAAA5QizAAAAlCPMAgAAUI4wCwAAQDnCLAAAAOUI\nswAAAJQjzAIAAFCOMAsAAEA5wiwAAADlCLMAAACUI8wCAABQjjALAABAOcIsAAAA5QizAAAAlCPM\nAgAAUI4wCwAAQDnCLAAAAOUIswAAAJQjzAIAAFCOMAsAAEA5g4TZ7UkOJ
HkuyV2LtLmvPf50kut7\n9n82ybEkz/S1n0ny1STfT/KVJJcM3mUAAAAmXVeYnU5yf5pAe12S25Nc29fmpiRXJbk6yfuTPNBz\n7HPtuf3uThNm35Tka+02AAAADKQrzN6Q5GCS55O8kuShJLf0tbk5yZ52fX+aWdbL2u1vJDm+wHV7\nz9mT5J3DdBoAAIDJ1hVmNyU53LN9pN03bJt+l6Z5/Djtz0s72gMAAMBpXWH25IDXmTrP8061HaY9\nAAAAE25dx/EXkmzp2d6SZuZ1qTab231LOZbmUeQXk1ye5KXFGu7cufP0+uzsbGZnZzsuDQAAwLiZ\nm5vL3Nzcil2vK8w+kaaw09YkR5PclqYIVK99SXakeZ92W5ITOfMI8WL2JXlfknvbnw8v1rA3zAIA\nAFBT/+Tkrl27lnW9rseMX00TVB9P8r0kX0zybJI72yVJHknygzSFonYn+Rc9538hyTfTVC0+nOSO\ndv/Hk7w9zVfz/Fq7DQAAAAPpmplNkkfbpdfuvu0di5zbP4t7yo+T3DjAvQEAAOAcXTOzAAAAMHaE\nWQAAAMoRZgEAAChHmAUAAKAcYRYAAIByhFkAAADKEWZXwIaZmUxNTZ1eNszMjLpLAAAAa9og3zNL\nhxPHj2fvgaOnt2+9ZuMIewMAALD2mZkFAACgHGEWAACAcoRZAAAAyhFmL4Dp6WkFoQAAAC4gBaAu\ngPn5eQWhAAAALiAzswAAAJQjzAIAAFCOMAsAAEA5wiwAAADlCLNjaMPMjGrIAAAAS1DNeAydOH5c\nNWQAAIAlmJkFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChH\nmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACActaNugMVTU9PZ2pqatTdAAAAmFjC\n7HmYn5/P3gNHT2/fes3GEfYGAABg8njMGAAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYXYENszMZGpq\n6vSyYWZm1F0CAAAoRTXjEThx/LhqyAAAAMtgZhYAAIByhFkAAADKEWYBAAAoxzuzq2B6ejpTU1Oj\n7gYAAMCaIcyugvn5eQWfAAAAVpDHjAEAAChHmAUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFAACg\nHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxhFgAAgHKEWQAAAMoRZgEAAChHmAUAAKAcYRYAAIBy\nhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAylk36g6QTE9PZ2pqatTdAAAAKEOYHQPz8/PZ\ne+Do6e1br9k4wt4AAACMP48ZAwAAUI4wCwAAQDnCLAAAAOUIswAAAJQjzAIAAFCOMAsAAEA5wiwA\nAADlCLMAAACUI8wCAABQjjBb0IaZmUxNTZ1eNszMjLpLAAAAq2rdqDvA8E4cP569B46e3r71mo0j\n7A0AAMDqMzMLAABAOcIsAAAA5QizAAAAlCPMAgAAUI4wW8D09PRZ1YuHpfoxAACw1gwSZrcnOZDk\nuSR3LdLmvvb400muH+DcnUmOJHmyXbYP0+lJMz8/n70Hjp5ehnWq+vGp5cTx4xeglwAAAKun66t5\nppPcn+TGJC8k+VaSfUme7WlzU5Krklyd5C1JHkiyrePck0k+1S4AAAAwlK6Z2RuSHEzyfJJXkjyU\n5Ja+Njcn2dOu709ySZLLBjh3+OdlAQAAIN1hdlOSwz3bR9p9g7TZ2HHuB9I8lvxgmgAMAAAAA+kK\nsycHvM6ws6wPJLkiyZuT/CjJJ4c8HwAAgAnW9c7sC0m29GxvSTPDulSbzW2b9Uuc+1LP/s8k+fJi\nHdi5c+fp9dnZ2czOznZ0GQAAgHEzNzeXubm5FbteV5h9Ik1hp61Jjia5LcntfW32JdmR5p3YbUlO\nJDmW5K+WOPfyNDOySfKuJM8s1oHeM
AsAAEBN/ZOTu3btWtb1usLsq2mC6uNpqhM/mKYa8Z3t8d1J\nHklT0fhgkpeT3NFxbpLcm+YR45NJfthzPQAAAOjUFWaT5NF26bW7b3vHEOcmyXsHuC8AAAAsqKsA\nFAAAAIwdYRYAAIByhFkAAADKEWYBAAAoR5gFAACgHGEWAACAcoRZAAAAyhFmAQAAKEeYBQAAoBxh\nFgAAgHKEWQAAAMoRZgEAAChHmF0DpqenMzU1dXrZMDMz6i6dZcPMzFn9W7d+/Vj3FwAAGH/rRt0B\nlm9+fj57Dxw9vX3rNRtH2JtznTh+/Jz+jXN/AQCA8WdmFgAAgHKEWQAAAMoRZgEAAChHmF2D+gtC\nsTz9BawUrAIAgNFTAGoNGveCUNUsVMAKAAAYLTOzAAAAlCPMAgAAUI4wCwAAQDnCLAAAAOUIs0xc\ntd5J+7wAALAWqWbMxFXrnbTPCwAAa5GZWQAAAMoRZgEAAChHmAUAAKAcYZYV119gCQAAYKUpAMWK\nU2AJAAC40MzMAgAAUI4wCwAAQDnCLAAAAOUIswAAAJQjzDK0/mrFG2ZmJro/o74/AABMItWMGdq4\nVSsedX9GfX8AAJhEZmYBAAAoR5gFAACgHGEWAACAcoRZlm16evqsAkjLPb+rgFJ/waVxp0AUAACs\nPAWgWLb5+fllFUAa9vxqBZeq9RcAACowMwsAAEA5wiwAAADlCLMAAACUI8wCAABQjjA7gZZbfXi1\nrXb1YtWSAQBg/KlmPIGWW314ta12NeBq1Yer9RcAAFaCmVkAAADKEWYBAAAoR5gFAACgHGGWNae/\nwNW69etLFXRarv7Pv9yCUMMWmFKQCgCA1aAAFGvOQgWuJqlA0koX+Bq2wJSCVAAArAYzswAAAJQj\nzAIAAFCOMAsAAEA5wiwAAADlCLOcY9TVgPvvP273u9D9W+vVgNd6deRq/QUAqEo1Y84x6mrAK12N\nd6Xvd6H7t9arAa/16sjV+gsAUJWZWQAAAMoRZgEAAChHmAUAAKAcYRZW2LAFovoLBq10+/7+9Bck\nqlawqOvzjJtq/QUAqEIBKFhhwxaIutAFkbr6U61g0WoXCFuuav0FAKjCzCwAAADlCLMAAACUI8wC\nAABQjjALAABAOcIsrLJhqx2Pm5Wuztt/vXXr169otea1Ztjq09WqVS/XqD/vqO8PAJNENWNYZdWr\n2650/xe63kpWa15rLnT16+pG/XlHfX8AmCRmZgEAAChHmAUAAKAcYRYAAIByhFkYcytdMKrresPe\nb9wKWi23IFR/AZ/+glT91+tvP2z/uq4/7roKHimItDR/PgBw/hSAgjG3GgWXlnO/cSvAtNz+LFTA\nZ6nrDVvwZ7kFr8ZN1+dXEGlp/nwA4PyZmQUAAKAcYRYAAIByhFkAAADKEWYBAAAoR5gFJsq4V49d\nbjXmYXVVb+7anjTj/t/PSpu0z9tv0j8/wLhTzRiYKONePXa1q0MPUr15LVVfXq5x/+9npU3a5+03\n6Z8fYNyZmQUAAKAcYRYAAIByhFkAAADKEWaBC6q/oNG43X/Y/o3683T1p6tA07j1f9gCO8O27/rz\n6d/uv17//S60rv529W/cChSNW/+6Cp6NW/9WuwDcqD8/Z1vr4+PzsRIGKQC1Pcmnk0wn+UySexdo\nc1+SdyT5v0n+aZInO86dSfLFJH87yfNJ3p3kxHn0Hxhzq13QaNj7D9u/UX+efgv1ZyU/74U2bIGd\nYdsP8uez1PVWuwDQsOM57gWKxq1/gxQ8G6XV/vMat/HhbGt9fHw+VkLXzOx0kvvThNLrktye5Nq+\nNjcluSrJ1Unen+SBAc69O8lXk7wpydfabdaQ7+z/5qi7wDIYP4Dhzc3NjboLnCdjV5vxm1xdYfaG\nJAfTzJ6+kuShJLf0tbk5yZ52fX+SS5Jc1nFu7zl7krzzPPvPmPrunwtDlRk/gOH5hbouY1eb8Ztc\nX
WF2U5LDPdtH2n2DtNm4xLmXJjnWrh9rtwEAAGAgXe/MnhzwOoNUxZha5Honh7jPsk1fdFEe/4PP\n5qm5ryZJfnDgu6t1awAAAFZIVwjdlmRnmvdek+SeJH+ds4tA/V6SuTSPESfJgSRvS3LFEuceSDKb\n5MUklyf5epJrFrj/wSRXDvRJAAAAqORQmvpLF8S69gZbk7wmyVNZuADUI+36tiR/NsC5n0hyV7t+\nd5KPr3jPAQAAmGjvSPI/08yS3tPuu7NdTrm/Pf50kl/oODdpvprnj5N8P8lX0hSNAgAAAAAAAGA1\nbU/zXu1zOfM4MuPls2kqUT/Ts28mzfcHLzTjfk+a8TyQ5NdXqY8sbEua99S/m+Q7ST7Y7jd+NfxM\nmq9BeyrJ95J8rN1v/OqYTvJkki+328aujueTfDvN+P15u8/41XBJki8leTbN/zvfEmNXxd9J83fu\n1PKTNL+7GL8a7knzO+czSf5zkr+RNT5202keS96aZH0Wfk+X0fuVJNfn7DD7iSQfadfvypl3oa9L\nM47r04zrwXR/LRQXzmVJ3tyuvz7NqwDXxvhVcnH7c12aOgW/HONXye8k+YMk+9ptY1fHD9P8EtbL\n+NWwJ8k/a9fXJXlDjF1FFyX5UZp/mDd+429rkh+kCbBJ8sUk78saH7tfSvJYz/bd7cL42Zqzw+yB\nnPnO4Mva7aT5F5beGfbH0hQLYzw8nOTGGL+KLk7yrSQ/F+NXxeY0NSP+Qc7MzBq7On6Y5I19+4zf\n+HtDml+o+xm7en49yTfadeM3/mbSTJpsSPOPSF9O8vas4NiNY9LdlORwz/aRdh/j79I0jx6n/Xnq\nP9KNacbxFGM6PrammWHfH+NXyUVp/uXyWM48Mm78avj3SX43zVfVnWLs6jiZ5h8jnkjyz9t9xm/8\nXZHkL5N8Lsn/SPIfk7wuxq6if5zkC+268Rt/P07yyST/K8nRJCfSPF68YmM3jmH25Kg7wIo4maXH\n0jiP3uuT7E3yoST/p++Y8Rtvf53mUfHNSX41zSxfL+M3nn4zyUtp3vla7Hvejd14e2uafwB8R5Lf\nTvPKTS/jN57Wpfm2jf/Q/nw55z71Z+zG32uS/KMk/2WBY8ZvPF2Z5F+mmTzZmOZ3z/f0tVnW2I1j\nmH0hzXPwp2zJ2Qmd8XUszaMCSXJ5ml/aknPHdHO7j9FZnybI/n6ax4wT41fRT5L8UZJfjPGr4O8n\nuTnNo6pfSPJraf4OGrs6ftT+/Msk/y3JDTF+FRxpl2+1219KE2pfjLGr5B1J/iLN37/E370K/l6S\nbyb5qyTCeAWrAAABNklEQVSvJvmvaV4pXdN/99YlOZQmwb8mCkCNs605twDUqefc7865L3O/Js2j\nPoey+KwEF95Ukv+U5nHHXsavhp/Nmap/r03yJ0n+YYxfNW/LmXdmjV0NFyf5m+3665L89zTv7xm/\nGv4kyZva9Z1pxs3Y1fJQmuJBpxi/8ffzab4547VpxmBPmqda1vzYvSPNy8IH07wIzPj5Qppn3/9f\nmnec70jzkvcfZ+Ey2/8qzXgeSPIbq9pT+v1ymsdUn8qZMvfbY/yq+Ltp3vl6Ks1XhPxuu9/41fK2\nnKlmbOxquCLN37un0vxydur3E+NXw8+nmZl9Os3s0Bti7Cp5XZL/nTP/oJQYvyo+kjNfzbMnzdOB\nxg4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABiN/w/dhfSVICqAoQAAAABJRU5ErkJg\ngg==\n",
       "text": [
        "<matplotlib.figure.Figure at 0x10b84c410>"
       ]
      }
     ],
     "prompt_number": 79
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Filling in Bad or Missing Values"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# collect every converted release year, deliberately keeping the 'bad' data point (1900):\n",
      "# it has to be present in the array so that it can be located and overwritten below\n",
      "# NOTE(review): 1900 is presumably the placeholder convert_year yields for an unparseable date - confirm against its definition\n",
      "years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(convert_year).collect()\n",
      "years_pre_processed_arr = np.array(years_pre_processed)\n",
      "bad_year_mask = years_pre_processed_arr == 1900\n",
      "# first we compute the mean and median year of release, without the 'bad' data point(s)\n",
      "mean_year = np.mean(years_pre_processed_arr[~bad_year_mask])\n",
      "median_year = np.median(years_pre_processed_arr[~bad_year_mask])\n",
      "# then we overwrite every 'bad' entry with the median (a no-op if there are none)\n",
      "idx_bad_data = np.where(bad_year_mask)[0]\n",
      "years_pre_processed_arr[idx_bad_data] = median_year\n",
      "print \"Mean year of release: %d\" % mean_year\n",
      "print \"Median year of release: %d\" % median_year\n",
      "print \"Index of '1900' after assigning median: %s\" % np.where(years_pre_processed_arr == 1900)[0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Mean year of release: 1989\n",
        "Median year of release: 1995\n",
        "Index of '1900' after assigning median: []\n"
       ]
      }
     ],
     "prompt_number": 112
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Feature Extraction"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Categorical Features: _1-of-k_ Encoding of User Occupation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# collect the distinct occupations to the driver, in sorted order\n",
      "all_occupations = sorted(user_fields.map(lambda fields: fields[3]).distinct().collect())\n",
      "# each occupation's position in the sorted list becomes its \"1-of-k\" index\n",
      "all_occupations_dict = dict((occupation, k) for k, occupation in enumerate(all_occupations))\n",
      "# look up a couple of occupations to see which \"1-of-k\" index they were given\n",
      "print \"Encoding of 'doctor': %d\" % all_occupations_dict['doctor']\n",
      "print \"Encoding of 'programmer': %d\" % all_occupations_dict['programmer']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Encoding of 'doctor': 2\n",
        "Encoding of 'programmer': 14\n"
       ]
      }
     ],
     "prompt_number": 82
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# encode \"programmer\" as a binary vector: one slot per occupation, all zeros except at its own index\n",
      "K = len(all_occupations_dict)\n",
      "k_programmer = all_occupations_dict['programmer']\n",
      "binary_x = np.zeros(K)\n",
      "binary_x[k_programmer] = 1\n",
      "print \"Binary feature vector: %s\" % binary_x\n",
      "print \"Length of binary vector: %d\" % K"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Binary feature vector: [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.\n",
        "  0.  0.  0.]\n",
        "Length of binary vector: 21\n"
       ]
      }
     ],
     "prompt_number": 83
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Transforming Timestamps into Categorical Features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# convert a timestamp (in seconds) from the dataset into a datetime object\n",
      "def extract_datetime(ts):\n",
      "    \"\"\"Return the datetime corresponding to `ts`, a Unix timestamp in seconds.\n",
      "\n",
      "    No timezone is passed to fromtimestamp, so the result is a naive datetime\n",
      "    expressed in the local timezone of the machine that runs this function.\n",
      "    \"\"\"\n",
      "    from datetime import datetime\n",
      "    return datetime.fromtimestamp(ts)\n",
      "    \n",
      "# the timestamp is the fourth field of each rating record\n",
      "timestamps = rating_data.map(lambda fields: int(fields[3]))\n",
      "# convert each timestamp to a datetime, then keep only its hour\n",
      "hour_of_day = timestamps.map(extract_datetime).map(lambda dt: dt.hour)\n",
      "hour_of_day.take(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 84,
       "text": [
        "[17, 21, 9, 7, 7]"
       ]
      }
     ],
     "prompt_number": 84
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# a function for assigning \"time-of-day\" bucket given an hour of the day\n",
      "def assign_tod(hr):\n",
      "    \"\"\"Map an hour of the day (0-23) to a coarse \"time-of-day\" label.\n",
      "\n",
      "    Returns 'morning' (7-11), 'lunch' (12-13), 'afternoon' (14-17),\n",
      "    'evening' (18-22) or 'night' (23 and 0-6). Any other value returns None.\n",
      "    \"\"\"\n",
      "    times_of_day = (\n",
      "        ('morning', range(7, 12)),\n",
      "        ('lunch', range(12, 14)),\n",
      "        ('afternoon', range(14, 18)),\n",
      "        ('evening', range(18, 23)),\n",
      "        # night wraps past midnight: range(23, 7) is empty, so it is spelled as two ranges\n",
      "        ('night', list(range(23, 24)) + list(range(0, 7))),\n",
      "    )\n",
      "    for label, hours in times_of_day:\n",
      "        if hr in hours:\n",
      "            return label\n",
      "    return None\n",
      "\n",
      "# label every entry of the \"hour of day\" RDD with its \"time of day\" bucket\n",
      "time_of_day = hour_of_day.map(assign_tod)\n",
      "time_of_day.take(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 85,
       "text": [
        "['afternoon', 'evening', 'morning', 'morning', 'morning']"
       ]
      }
     ],
     "prompt_number": 85
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Simple Text Feature Extraction"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# strip the year of release from a raw movie title, leaving just the title text\n",
      "def extract_title(raw):\n",
      "    \"\"\"Return `raw` cut off just before its first parenthesised word, e.g. the \"(1995)\" in \"Toy Story (1995)\".\n",
      "\n",
      "    The kept part has its surrounding whitespace stripped. A title with no such\n",
      "    parenthesised word is returned unchanged.\n",
      "    \"\"\"\n",
      "    import re\n",
      "    # matches a run of word characters (letters, digits, underscore) enclosed in parentheses\n",
      "    year_match = re.search(\"\\((\\w+)\\)\", raw)\n",
      "    if not year_match:\n",
      "        return raw\n",
      "    # keep everything before the opening parenthesis, minus the whitespace around it\n",
      "    return raw[:year_match.start()].strip()\n",
      "\n",
      "# the raw title (title text plus year of release) is the second movie field\n",
      "raw_titles = movie_fields.map(lambda fields: fields[1])\n",
      "# try the title extraction function on the first 5 raw titles before applying it to all of them\n",
      "for title in map(extract_title, raw_titles.take(5)):\n",
      "    print title"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Toy Story\n",
        "GoldenEye\n",
        "Four Rooms\n",
        "Get Shorty\n",
        "Copycat\n"
       ]
      }
     ],
     "prompt_number": 89
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# the extraction looks right on the sample, so run it over every raw title\n",
      "movie_titles = raw_titles.map(extract_title)\n",
      "# tokenize each title into terms with simple whitespace tokenization (split on single spaces)\n",
      "title_terms = movie_titles.map(lambda title: title.split(\" \"))\n",
      "print title_terms.take(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]\n"
       ]
      }
     ],
     "prompt_number": 90
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# collect all the distinct terms across the titles, in order to build our dictionary of term <-> index mappings\n",
      "all_terms = title_terms.flatMap(lambda terms: terms).distinct().collect()\n",
      "# each term's position in the collected list becomes its \"1-of-k\" index\n",
      "all_terms_dict = dict((term, k) for k, term in enumerate(all_terms))\n",
      "num_terms = len(all_terms_dict)\n",
      "print \"Total number of terms: %d\" % num_terms\n",
      "print \"Index of term 'Dead': %d\" % all_terms_dict['Dead']\n",
      "print \"Index of term 'Rooms': %d\" % all_terms_dict['Rooms']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Total number of terms: 2645\n",
        "Index of term 'Dead': 147\n",
        "Index of term 'Rooms': 1963\n"
       ]
      }
     ],
     "prompt_number": 96
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# alternatively, Spark's 'zipWithIndex' RDD function can number the distinct terms for us,\n",
      "# and 'collectAsMap' returns the resulting (term, index) pairs to the driver as a dictionary\n",
      "all_terms_dict2 = (\n",
      "    title_terms\n",
      "    .flatMap(lambda terms: terms)\n",
      "    .distinct()\n",
      "    .zipWithIndex()\n",
      "    .collectAsMap()\n",
      ")\n",
      "print \"Index of term 'Dead': %d\" % all_terms_dict2['Dead']\n",
      "print \"Index of term 'Rooms': %d\" % all_terms_dict2['Rooms']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Index of term 'Dead': 147\n",
        "Index of term 'Rooms': 1963\n"
       ]
      }
     ],
     "prompt_number": 97
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# this function takes a list of terms and encodes it as a scipy sparse vector using an approach \n",
      "# similar to the 1-of-k encoding\n",
      "def create_vector(terms, term_dict):\n",
      "    \"\"\"Encode `terms` as a 1 x len(term_dict) sparse row vector.\n",
      "\n",
      "    terms     -- iterable of terms; any term missing from `term_dict` is ignored\n",
      "    term_dict -- dict mapping each known term to its column index\n",
      "                 (assumes the indexes are 0 .. len(term_dict) - 1, as built in this notebook)\n",
      "\n",
      "    Returns a scipy CSC matrix holding 1.0 in the column of every known term.\n",
      "    \"\"\"\n",
      "    from scipy import sparse as sp\n",
      "    # size the vector from the dictionary that was passed in, not from a notebook-level global;\n",
      "    # fill a LIL matrix (cheap item assignment) and convert to CSC only once at the end\n",
      "    x = sp.lil_matrix((1, len(term_dict)))\n",
      "    for t in terms:\n",
      "        if t in term_dict:\n",
      "            x[0, term_dict[t]] = 1\n",
      "    return x.tocsc()\n",
      "# broadcast the term dictionary, then encode every title's term list against the broadcast value\n",
      "all_terms_bcast = sc.broadcast(all_terms_dict)\n",
      "term_vectors = title_terms.map(lambda title: create_vector(title, all_terms_bcast.value))\n",
      "term_vectors.take(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 98,
       "text": [
        "[<1x2645 sparse matrix of type '<type 'numpy.float64'>'\n",
        " \twith 2 stored elements in Compressed Sparse Column format>,\n",
        " <1x2645 sparse matrix of type '<type 'numpy.float64'>'\n",
        " \twith 1 stored elements in Compressed Sparse Column format>,\n",
        " <1x2645 sparse matrix of type '<type 'numpy.float64'>'\n",
        " \twith 2 stored elements in Compressed Sparse Column format>,\n",
        " <1x2645 sparse matrix of type '<type 'numpy.float64'>'\n",
        " \twith 2 stored elements in Compressed Sparse Column format>,\n",
        " <1x2645 sparse matrix of type '<type 'numpy.float64'>'\n",
        " \twith 1 stored elements in Compressed Sparse Column format>]"
       ]
      }
     ],
     "prompt_number": 98
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Normalizing Features"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Scaling the Norm of Vectors"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# fix the seed so that the random example vector is the same on every run\n",
      "np.random.seed(42)\n",
      "x = np.random.randn(10)\n",
      "# np.linalg.norm defaults to the 2-norm for a vector; dividing by it scales x to unit length\n",
      "norm_x_2 = np.linalg.norm(x)\n",
      "normalized_x = x / norm_x_2\n",
      "print \"x:\\n%s\" % x\n",
      "print \"2-Norm of x: %2.4f\" % norm_x_2\n",
      "print \"Normalized x:\\n%s\" % normalized_x\n",
      "print \"2-Norm of normalized_x: %2.4f\" % np.linalg.norm(normalized_x)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "x:\n",
        "[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696\n",
        "  1.57921282  0.76743473 -0.46947439  0.54256004]\n",
        "2-Norm of x: 2.5908\n",
        "Normalized x:\n",
        "[ 0.19172213 -0.05336737  0.24999534  0.58786029 -0.09037871 -0.09037237\n",
        "  0.60954584  0.29621508 -0.1812081   0.20941776]\n",
        "2-Norm of normalized_x: 1.0000\n"
       ]
      }
     ],
     "prompt_number": 99
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Scaling the Norm of Vectors with MLlib's Normalizer"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from pyspark.mllib.feature import Normalizer\n",
      "# wrap the example vector x in a single-element RDD, since the normalizer is applied to an RDD here\n",
      "vector = sc.parallelize([x])\n",
      "normalizer = Normalizer()\n",
      "# normalize, then bring the single result back to the driver as a numpy array\n",
      "normalized_x_mllib = normalizer.transform(vector).first().toArray()\n",
      "\n",
      "print \"x:\\n%s\" % x\n",
      "print \"2-Norm of x: %2.4f\" % norm_x_2\n",
      "print \"Normalized x MLlib:\\n%s\" % normalized_x_mllib\n",
      "print \"2-Norm of normalized_x_mllib: %2.4f\" % np.linalg.norm(normalized_x_mllib)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "x:\n",
        "[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696\n",
        "  1.57921282  0.76743473 -0.46947439  0.54256004]\n",
        "2-Norm of x: 2.5908\n",
        "Normalized x MLlib:\n",
        "[ 0.19172213 -0.05336737  0.24999534  0.58786029 -0.09037871 -0.09037237\n",
        "  0.60954584  0.29621508 -0.1812081   0.20941776]\n",
        "2-Norm of normalized_x_mllib: 1.0000\n"
       ]
      }
     ],
     "prompt_number": 101
    }
   ],
   "metadata": {}
  }
 ]
}